Skip to content

Commit 7b0d56b

Browse files
AMDGPU/GlobalISel: Fix inst-selection of ballot (#109986)
Both the input and the output of ballot are lane-masks: the result is a lane-mask with 'S32/S64 LLT and SGPR bank', and the input is a lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from the input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in the result for inactive lanes for non-constant input.
1 parent 42ec740 commit 7b0d56b

File tree

8 files changed

+360
-41
lines changed

8 files changed

+360
-41
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
13691369
sign-extended from the width of the underlying PC hardware register even on
13701370
processors where the s_getpc_b64 instruction returns a zero-extended value.
13711371

1372+
llvm.amdgcn.ballot Returns a bitfield (i32 or i64) containing the result of its i1 argument
1373+
in all active lanes, and zero in all inactive lanes.
1374+
Provides a way to convert an i1 in LLVM IR to an i32 or i64 lane mask - a bitfield
1375+
used by hardware to control active lanes when used in the EXEC register.
1376+
For example, ballot(i1 true) returns the EXEC mask.
1377+
13721378
============================================== ==========================================================
13731379

13741380
.. TODO::

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp :
20862086
[IntrNoMem, IntrConvergent,
20872087
ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
20882088

2089+
// Returns a bitfield (i32 or i64) containing the result of its i1 argument
2090+
// in all active lanes, and zero in all inactive lanes.
20892091
def int_amdgcn_ballot :
20902092
Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
20912093
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 76 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1413,50 +1413,101 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
14131413
return true;
14141414
}
14151415

1416+
// Ballot has to zero bits in input lane-mask that are zero in current exec.
1417+
// This is done as AND with exec. For inputs that are results of instructions
1418+
// that implicitly use the same exec, for example compares in the same basic
1419+
// block or an SCC to VCC copy, a plain copy is enough.
1420+
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1421+
MachineBasicBlock *MBB) {
1422+
MachineInstr *MI = MRI.getVRegDef(Reg);
1423+
if (MI->getParent() != MBB)
1424+
return false;
1425+
1426+
// Lane mask generated by SCC to VCC copy.
1427+
if (MI->getOpcode() == AMDGPU::COPY) {
1428+
auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1429+
auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1430+
if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1431+
SrcRB->getID() == AMDGPU::SGPRRegBankID)
1432+
return true;
1433+
}
1434+
1435+
// Lane mask generated using compare with same exec.
1436+
if (isa<GAnyCmp>(MI))
1437+
return true;
1438+
1439+
Register LHS, RHS;
1440+
// Look through AND.
1441+
if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1442+
return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1443+
isLaneMaskFromSameBlock(RHS, MRI, MBB);
1444+
1445+
return false;
1446+
}
1447+
14161448
bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
14171449
MachineBasicBlock *BB = I.getParent();
14181450
const DebugLoc &DL = I.getDebugLoc();
14191451
Register DstReg = I.getOperand(0).getReg();
1420-
const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1421-
const bool Is64 = Size == 64;
1422-
const bool IsWave32 = (STI.getWavefrontSize() == 32);
1452+
Register SrcReg = I.getOperand(2).getReg();
1453+
const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1454+
const unsigned WaveSize = STI.getWavefrontSize();
14231455

14241456
// In the common case, the return type matches the wave size.
14251457
// However we also support emitting i64 ballots in wave32 mode.
1426-
if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1458+
if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
14271459
return false;
14281460

14291461
std::optional<ValueAndVReg> Arg =
1430-
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1462+
getIConstantVRegValWithLookThrough(SrcReg, *MRI);
1463+
1464+
Register Dst = DstReg;
1465+
// i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1466+
if (BallotSize != WaveSize) {
1467+
Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1468+
}
14311469

1432-
const auto BuildCopy = [&](Register SrcReg) {
1433-
if (Size == STI.getWavefrontSize()) {
1434-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1435-
.addReg(SrcReg);
1436-
return;
1470+
if (Arg) {
1471+
const int64_t Value = Arg->Value.getZExtValue();
1472+
if (Value == 0) {
1473+
// Dst = S_MOV 0
1474+
unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1475+
BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1476+
} else {
1477+
// Dst = COPY EXEC
1478+
assert(Value == 1);
1479+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
14371480
}
1481+
if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1482+
return false;
1483+
} else {
1484+
if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1485+
// Dst = COPY SrcReg
1486+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1487+
if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1488+
return false;
1489+
} else {
1490+
// Dst = S_AND SrcReg, EXEC
1491+
unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1492+
auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1493+
.addReg(SrcReg)
1494+
.addReg(TRI.getExec())
1495+
.setOperandDead(3); // Dead scc
1496+
if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1497+
return false;
1498+
}
1499+
}
14381500

1439-
// If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1501+
// i64 ballot on Wave32: zero-extend i32 ballot to i64.
1502+
if (BallotSize != WaveSize) {
14401503
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
14411504
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
14421505
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1443-
.addReg(SrcReg)
1506+
.addReg(Dst)
14441507
.addImm(AMDGPU::sub0)
14451508
.addReg(HiReg)
14461509
.addImm(AMDGPU::sub1);
1447-
};
1448-
1449-
if (Arg) {
1450-
const int64_t Value = Arg->Value.getSExtValue();
1451-
if (Value == 0) {
1452-
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1453-
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1454-
} else if (Value == -1) // all ones
1455-
BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1456-
else
1457-
return false;
1458-
} else
1459-
BuildCopy(I.getOperand(2).getReg());
1510+
}
14601511

14611512
I.eraseFromParent();
14621513
return true;

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
3-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
44

55
declare i32 @llvm.amdgcn.ballot.i32(i1)
66
declare i32 @llvm.ctpop.i32(i32)
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
3333
; CHECK-LABEL: non_compare:
3434
; CHECK: ; %bb.0:
3535
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
36-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
36+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
37+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
3738
; CHECK-NEXT: ; return to shader part epilog
3839
%trunc = trunc i32 %x to i1
3940
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8990
; CHECK: ; %bb.0:
9091
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9192
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
92-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
93+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
94+
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
9395
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
9496
; CHECK-NEXT: ; %bb.1: ; %true
9597
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -137,7 +139,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
137139
; CHECK: ; %bb.0:
138140
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
139141
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
140-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
142+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
143+
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
141144
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
142145
; CHECK-NEXT: ; %bb.1: ; %false
143146
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -419,3 +422,80 @@ true:
419422
false:
420423
ret i32 33
421424
}
425+
426+
; Input that is neither a constant nor the direct result of a compare.
427+
; Tests that inactive lanes are set to 0.
428+
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
429+
; GFX10-LABEL: non_cst_non_compare_input:
430+
; GFX10: ; %bb.0: ; %entry
431+
; GFX10-NEXT: s_and_b32 s0, 1, s0
432+
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
433+
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
434+
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
435+
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
436+
; GFX10-NEXT: ; %bb.1: ; %B
437+
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
438+
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
439+
; GFX10-NEXT: ; implicit-def: $vgpr2
440+
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
441+
; GFX10-NEXT: s_or_b32 s0, s0, s2
442+
; GFX10-NEXT: ; %bb.2: ; %Flow
443+
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
444+
; GFX10-NEXT: ; %bb.3: ; %A
445+
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
446+
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
447+
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
448+
; GFX10-NEXT: s_or_b32 s0, s0, s2
449+
; GFX10-NEXT: ; %bb.4: ; %exit
450+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
451+
; GFX10-NEXT: s_and_b32 s0, s0, exec_lo
452+
; GFX10-NEXT: v_mov_b32_e32 v2, s0
453+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
454+
; GFX10-NEXT: s_endpgm
455+
;
456+
; GFX11-LABEL: non_cst_non_compare_input:
457+
; GFX11: ; %bb.0: ; %entry
458+
; GFX11-NEXT: s_and_b32 s0, 1, s0
459+
; GFX11-NEXT: s_mov_b32 s1, exec_lo
460+
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
461+
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
462+
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
463+
; GFX11-NEXT: ; %bb.1: ; %B
464+
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
465+
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
466+
; GFX11-NEXT: ; implicit-def: $vgpr2
467+
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
468+
; GFX11-NEXT: s_or_b32 s0, s0, s2
469+
; GFX11-NEXT: ; %bb.2: ; %Flow
470+
; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
471+
; GFX11-NEXT: ; %bb.3: ; %A
472+
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
473+
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
474+
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
475+
; GFX11-NEXT: s_or_b32 s0, s0, s2
476+
; GFX11-NEXT: ; %bb.4: ; %exit
477+
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
478+
; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
479+
; GFX11-NEXT: v_mov_b32_e32 v2, s0
480+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
481+
; GFX11-NEXT: s_nop 0
482+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
483+
; GFX11-NEXT: s_endpgm
484+
entry:
485+
%cmp = icmp eq i32 %cond, 0
486+
br i1 %cmp, label %A, label %B
487+
488+
A:
489+
%val_A = icmp uge i32 %tid, 1
490+
br label %exit
491+
492+
B:
493+
%val_B = icmp ult i32 %tid, 2
494+
br label %exit
495+
496+
exit:
497+
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
498+
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
499+
store i32 %ballot, ptr addrspace(1) %out
500+
ret void
501+
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
3434
; CHECK-LABEL: non_compare:
3535
; CHECK: ; %bb.0:
3636
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
37-
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
37+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
38+
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
3839
; CHECK-NEXT: ; return to shader part epilog
3940
%trunc = trunc i32 %x to i1
4041
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
@@ -92,7 +93,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
9293
; CHECK: ; %bb.0:
9394
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9495
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
95-
; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
96+
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
97+
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
9698
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
9799
; CHECK-NEXT: ; %bb.1: ; %true
98100
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -140,7 +142,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
140142
; CHECK: ; %bb.0:
141143
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
142144
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
143-
; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
145+
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
146+
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
144147
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
145148
; CHECK-NEXT: ; %bb.1: ; %false
146149
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -422,3 +425,52 @@ true:
422425
false:
423426
ret i32 33
424427
}
428+
429+
; Input that is neither a constant nor the direct result of a compare.
430+
; Tests that inactive lanes are set to 0.
431+
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
432+
; CHECK-LABEL: non_cst_non_compare_input:
433+
; CHECK: ; %bb.0: ; %entry
434+
; CHECK-NEXT: s_and_b32 s0, 1, s0
435+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
436+
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
437+
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
438+
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
439+
; CHECK-NEXT: ; %bb.1: ; %B
440+
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
441+
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
442+
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
443+
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
444+
; CHECK-NEXT: ; implicit-def: $vgpr2
445+
; CHECK-NEXT: ; %bb.2: ; %Flow
446+
; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
447+
; CHECK-NEXT: ; %bb.3: ; %A
448+
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2
449+
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
450+
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
451+
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
452+
; CHECK-NEXT: ; %bb.4: ; %exit
453+
; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
454+
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
455+
; CHECK-NEXT: v_mov_b32_e32 v3, s1
456+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
457+
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
458+
; CHECK-NEXT: s_endpgm
459+
entry:
460+
%cmp = icmp eq i32 %cond, 0
461+
br i1 %cmp, label %A, label %B
462+
463+
A:
464+
%val_A = icmp uge i32 %tid, 1
465+
br label %exit
466+
467+
B:
468+
%val_B = icmp ult i32 %tid, 2
469+
br label %exit
470+
471+
exit:
472+
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
473+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
474+
store i64 %ballot, ptr addrspace(1) %out
475+
ret void
476+
}

0 commit comments

Comments
 (0)