Skip to content

Commit 0c0e21b

Browse files
AMDGPU/GlobalISel: Fix inst-selection of ballot
Both the input and the output of ballot are lane-masks: the result is a lane-mask with 'S32/S64 LLT and SGPR bank', and the input is a lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from the input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in the result for inactive lanes when the input was non-constant.
1 parent 6d8e966 commit 0c0e21b

File tree

8 files changed

+360
-41
lines changed

8 files changed

+360
-41
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
13691369
sign-extended from the width of the underlying PC hardware register even on
13701370
processors where the s_getpc_b64 instruction returns a zero-extended value.
13711371

1372+
llvm.amdgcn.ballot Returns a bitfield(i32 or i64) containing the result of its i1 argument
1373+
in all active lanes, and zero in all inactive lanes.
1374+
Provides a way to convert i1 in LLVM IR to i32 or i64 lane mask - bitfield
1375+
used by hardware to control active lanes when used in EXEC register.
1376+
For example, ballot(i1 true) returns the EXEC mask.
1377+
13721378
============================================== ==========================================================
13731379

13741380
.. TODO::

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp :
20862086
[IntrNoMem, IntrConvergent,
20872087
ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
20882088

2089+
// Returns a bitfield (i32 or i64) containing the result of its i1 argument
2090+
// in all active lanes, and zero in all inactive lanes.
20892091
def int_amdgcn_ballot :
20902092
Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
20912093
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 72 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1413,50 +1413,97 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
14131413
return true;
14141414
}
14151415

1416+
// Ballot has to zero the bits of the input lane-mask that are zero in the
1417+
// current exec; this is done as an AND with exec. For inputs that are results
1418+
// of instructions that implicitly use the same exec (for example, compares in
// the same basic block), a plain copy is used instead.
1419+
bool isBallotCopy(Register Reg, MachineRegisterInfo &MRI,
1420+
MachineBasicBlock *MBB) {
1421+
MachineInstr *MI = MRI.getVRegDef(Reg);
1422+
// Look through copies, truncs and anyext. TODO: just copies
1423+
while (MI->getOpcode() == AMDGPU::COPY ||
1424+
MI->getOpcode() == AMDGPU::G_TRUNC ||
1425+
MI->getOpcode() == AMDGPU::G_ANYEXT) {
1426+
Reg = MI->getOperand(1).getReg();
1427+
if (!Reg.isVirtual())
1428+
return false;
1429+
MI = MRI.getVRegDef(Reg);
1430+
}
1431+
1432+
// Lane mask generated using compare with same exec.
1433+
if (isa<GAnyCmp>(MI) && MI->getParent() == MBB)
1434+
return true;
1435+
1436+
Register LHS, RHS;
1437+
// Look through AND.
1438+
if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1439+
return isBallotCopy(LHS, MRI, MBB) || isBallotCopy(RHS, MRI, MBB);
1440+
1441+
return false;
1442+
}
1443+
14161444
bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
14171445
MachineBasicBlock *BB = I.getParent();
14181446
const DebugLoc &DL = I.getDebugLoc();
14191447
Register DstReg = I.getOperand(0).getReg();
1420-
const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1421-
const bool Is64 = Size == 64;
1422-
const bool IsWave32 = (STI.getWavefrontSize() == 32);
1448+
Register SrcReg = I.getOperand(2).getReg();
1449+
const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1450+
const unsigned WaveSize = STI.getWavefrontSize();
14231451

14241452
// In the common case, the return type matches the wave size.
14251453
// However we also support emitting i64 ballots in wave32 mode.
1426-
if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1454+
if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
14271455
return false;
14281456

14291457
std::optional<ValueAndVReg> Arg =
1430-
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1458+
getIConstantVRegValWithLookThrough(SrcReg, *MRI);
1459+
1460+
Register Dst = DstReg;
1461+
// i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1462+
if (BallotSize != WaveSize) {
1463+
Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1464+
}
14311465

1432-
const auto BuildCopy = [&](Register SrcReg) {
1433-
if (Size == STI.getWavefrontSize()) {
1434-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1435-
.addReg(SrcReg);
1436-
return;
1466+
if (Arg) {
1467+
const int64_t Value = Arg->Value.getZExtValue();
1468+
if (Value == 0) {
1469+
// Dst = S_MOV 0
1470+
unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1471+
BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1472+
} else {
1473+
// Dst = COPY EXEC
1474+
assert(Value == 1);
1475+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
14371476
}
1477+
if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1478+
return false;
1479+
} else {
1480+
if (isBallotCopy(SrcReg, *MRI, BB)) {
1481+
// Dst = COPY SrcReg
1482+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1483+
if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1484+
return false;
1485+
} else {
1486+
// Dst = S_AND SrcReg, EXEC
1487+
unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1488+
auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1489+
.addReg(SrcReg)
1490+
.addReg(TRI.getExec())
1491+
.setOperandDead(3); // Dead scc
1492+
if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1493+
return false;
1494+
}
1495+
}
14381496

1439-
// If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1497+
// i64 ballot on Wave32: zero-extend i32 ballot to i64.
1498+
if (BallotSize != WaveSize) {
14401499
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
14411500
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
14421501
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1443-
.addReg(SrcReg)
1502+
.addReg(Dst)
14441503
.addImm(AMDGPU::sub0)
14451504
.addReg(HiReg)
14461505
.addImm(AMDGPU::sub1);
1447-
};
1448-
1449-
if (Arg) {
1450-
const int64_t Value = Arg->Value.getSExtValue();
1451-
if (Value == 0) {
1452-
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1453-
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1454-
} else if (Value == -1) // all ones
1455-
BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1456-
else
1457-
return false;
1458-
} else
1459-
BuildCopy(I.getOperand(2).getReg());
1506+
}
14601507

14611508
I.eraseFromParent();
14621509
return true;

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 87 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
3-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
44

55
declare i32 @llvm.amdgcn.ballot.i32(i1)
66
declare i32 @llvm.ctpop.i32(i32)
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
3333
; CHECK-LABEL: non_compare:
3434
; CHECK: ; %bb.0:
3535
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
36-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
36+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
37+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
3738
; CHECK-NEXT: ; return to shader part epilog
3839
%trunc = trunc i32 %x to i1
3940
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8990
; CHECK: ; %bb.0:
9091
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9192
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
92-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
93+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
94+
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
9395
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
9496
; CHECK-NEXT: ; %bb.1: ; %true
9597
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -113,6 +115,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
113115
; CHECK: ; %bb.0:
114116
; CHECK-NEXT: s_and_b32 s0, 1, s0
115117
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
118+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
116119
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
117120
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
118121
; CHECK-NEXT: ; %bb.1: ; %true
@@ -137,7 +140,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
137140
; CHECK: ; %bb.0:
138141
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
139142
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
140-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
143+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
144+
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
141145
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
142146
; CHECK-NEXT: ; %bb.1: ; %false
143147
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -161,6 +165,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
161165
; CHECK: ; %bb.0:
162166
; CHECK-NEXT: s_and_b32 s0, 1, s0
163167
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
168+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
164169
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
165170
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
166171
; CHECK-NEXT: ; %bb.1: ; %false
@@ -419,3 +424,80 @@ true:
419424
false:
420425
ret i32 33
421426
}
427+
428+
; Input that is not constant or direct result of a compare.
429+
; Tests setting 0 to inactive lanes.
430+
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
431+
; GFX10-LABEL: non_cst_non_compare_input:
432+
; GFX10: ; %bb.0: ; %entry
433+
; GFX10-NEXT: s_and_b32 s0, 1, s0
434+
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
435+
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
436+
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
437+
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
438+
; GFX10-NEXT: ; %bb.1: ; %B
439+
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
440+
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
441+
; GFX10-NEXT: ; implicit-def: $vgpr2
442+
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
443+
; GFX10-NEXT: s_or_b32 s0, s0, s2
444+
; GFX10-NEXT: ; %bb.2: ; %Flow
445+
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
446+
; GFX10-NEXT: ; %bb.3: ; %A
447+
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
448+
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
449+
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
450+
; GFX10-NEXT: s_or_b32 s0, s0, s2
451+
; GFX10-NEXT: ; %bb.4: ; %exit
452+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
453+
; GFX10-NEXT: s_and_b32 s0, s0, exec_lo
454+
; GFX10-NEXT: v_mov_b32_e32 v2, s0
455+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
456+
; GFX10-NEXT: s_endpgm
457+
;
458+
; GFX11-LABEL: non_cst_non_compare_input:
459+
; GFX11: ; %bb.0: ; %entry
460+
; GFX11-NEXT: s_and_b32 s0, 1, s0
461+
; GFX11-NEXT: s_mov_b32 s1, exec_lo
462+
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
463+
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
464+
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
465+
; GFX11-NEXT: ; %bb.1: ; %B
466+
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
467+
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
468+
; GFX11-NEXT: ; implicit-def: $vgpr2
469+
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
470+
; GFX11-NEXT: s_or_b32 s0, s0, s2
471+
; GFX11-NEXT: ; %bb.2: ; %Flow
472+
; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
473+
; GFX11-NEXT: ; %bb.3: ; %A
474+
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
475+
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
476+
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
477+
; GFX11-NEXT: s_or_b32 s0, s0, s2
478+
; GFX11-NEXT: ; %bb.4: ; %exit
479+
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
480+
; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
481+
; GFX11-NEXT: v_mov_b32_e32 v2, s0
482+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
483+
; GFX11-NEXT: s_nop 0
484+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
485+
; GFX11-NEXT: s_endpgm
486+
entry:
487+
%cmp = icmp eq i32 %cond, 0
488+
br i1 %cmp, label %A, label %B
489+
490+
A:
491+
%val_A = icmp uge i32 %tid, 1
492+
br label %exit
493+
494+
B:
495+
%val_B = icmp ult i32 %tid, 2
496+
br label %exit
497+
498+
exit:
499+
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
500+
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
501+
store i32 %ballot, ptr addrspace(1) %out
502+
ret void
503+
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
3434
; CHECK-LABEL: non_compare:
3535
; CHECK: ; %bb.0:
3636
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
37-
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
37+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
38+
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
3839
; CHECK-NEXT: ; return to shader part epilog
3940
%trunc = trunc i32 %x to i1
4041
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
@@ -92,7 +93,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
9293
; CHECK: ; %bb.0:
9394
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9495
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
95-
; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
96+
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
97+
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
9698
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
9799
; CHECK-NEXT: ; %bb.1: ; %true
98100
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -116,6 +118,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
116118
; CHECK: ; %bb.0:
117119
; CHECK-NEXT: s_and_b32 s0, 1, s0
118120
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
121+
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
119122
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
120123
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
121124
; CHECK-NEXT: ; %bb.1: ; %true
@@ -140,7 +143,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
140143
; CHECK: ; %bb.0:
141144
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
142145
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
143-
; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
146+
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
147+
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
144148
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
145149
; CHECK-NEXT: ; %bb.1: ; %false
146150
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -164,6 +168,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
164168
; CHECK: ; %bb.0:
165169
; CHECK-NEXT: s_and_b32 s0, 1, s0
166170
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
171+
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
167172
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
168173
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
169174
; CHECK-NEXT: ; %bb.1: ; %false
@@ -422,3 +427,52 @@ true:
422427
false:
423428
ret i32 33
424429
}
430+
431+
; Input that is not constant or direct result of a compare.
432+
; Tests setting 0 to inactive lanes.
433+
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
434+
; CHECK-LABEL: non_cst_non_compare_input:
435+
; CHECK: ; %bb.0: ; %entry
436+
; CHECK-NEXT: s_and_b32 s0, 1, s0
437+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
438+
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
439+
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
440+
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
441+
; CHECK-NEXT: ; %bb.1: ; %B
442+
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
443+
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
444+
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
445+
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
446+
; CHECK-NEXT: ; implicit-def: $vgpr2
447+
; CHECK-NEXT: ; %bb.2: ; %Flow
448+
; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
449+
; CHECK-NEXT: ; %bb.3: ; %A
450+
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2
451+
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
452+
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
453+
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
454+
; CHECK-NEXT: ; %bb.4: ; %exit
455+
; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
456+
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
457+
; CHECK-NEXT: v_mov_b32_e32 v3, s1
458+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
459+
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
460+
; CHECK-NEXT: s_endpgm
461+
entry:
462+
%cmp = icmp eq i32 %cond, 0
463+
br i1 %cmp, label %A, label %B
464+
465+
A:
466+
%val_A = icmp uge i32 %tid, 1
467+
br label %exit
468+
469+
B:
470+
%val_B = icmp ult i32 %tid, 2
471+
br label %exit
472+
473+
exit:
474+
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
475+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
476+
store i64 %ballot, ptr addrspace(1) %out
477+
ret void
478+
}

0 commit comments

Comments
 (0)