Skip to content

Commit 50ab573

Browse files
committed
per comments:
* supported negated ballot * improved comments
1 parent 8b836e2 commit 50ab573

File tree

5 files changed

+247
-12
lines changed

5 files changed

+247
-12
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
2020
#include "MCTargetDesc/R600MCTargetDesc.h"
2121
#include "R600RegisterInfo.h"
22+
#include "SIISelLowering.h"
2223
#include "SIMachineFunctionInfo.h"
2324
#include "llvm/Analysis/UniformityAnalysis.h"
2425
#include "llvm/Analysis/ValueTracking.h"
@@ -2259,26 +2260,30 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
22592260
return false;
22602261
}
22612262

2262-
bool isBoolSGPR(SDValue V);
2263-
2264-
static SDValue combineBallotPattern(SDValue VCMP) {
2263+
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
22652264
assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
22662265
// Special case for amdgcn.ballot:
22672266
// %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2268-
// %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne ; lowered ballot
2267+
// %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
22692268
// =>
22702269
// Use i1 %Cond value instead of i(WaveSize) %VCMP.
22712270
// This is possible because divergent ISD::SETCC is selected as V_CMP and
22722271
// Cond becomes an i(WaveSize) full mask value.
2272+
// Note that ballot doesn't use SETEQ condition but it's easy to support it
2273+
// here for completeness, so in this case Negate is set true on return.
22732274
auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
22742275
auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
2275-
if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
2276+
if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && VCMP_CRHS &&
2277+
VCMP_CRHS->isZero()) {
2278+
22762279
auto Cond = VCMP.getOperand(0);
22772280
if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
22782281
Cond = Cond.getOperand(0);
22792282

2280-
if (isBoolSGPR(Cond))
2283+
if (isBoolSGPR(Cond)) {
2284+
Negate = VCMP_CC == ISD::SETEQ;
22812285
return Cond;
2286+
}
22822287
}
22832288
return SDValue();
22842289
}
@@ -2306,14 +2311,18 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
23062311
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
23072312
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
23082313
// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2314+
// BRCOND i1 %C, %BB
23092315
// =>
2310-
// Use "i(WaveSize) %VCMP value in VCC register ne/eq zero" as the branch
2311-
// condition.
2316+
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2317+
// VCC = COPY i(WaveSize) %VCMP
2318+
// S_CBRANCH_VCCNZ/VCCZ %BB
23122319
Negate = CC == ISD::SETEQ;
23132320
auto VCMP = Cond->getOperand(0);
2314-
if (auto BallotCond = combineBallotPattern(VCMP)) {
2321+
bool NegatedBallot = false;
2322+
if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
23152323
Cond = BallotCond;
23162324
UseSCCBr = !BallotCond->isDivergent();
2325+
Negate = Negate ^ NegatedBallot;
23172326
} else {
23182327
// TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
23192328
// selected as V_CMP, but this may change for uniform condition.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10499,9 +10499,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
1049910499
return SDValue();
1050010500
}
1050110501

10502-
// Returns true if argument is a boolean value which is not serialized into
10503-
// memory or argument and does not require v_cndmask_b32 to be deserialized.
10504-
bool isBoolSGPR(SDValue V) {
10502+
bool llvm::isBoolSGPR(SDValue V) {
1050510503
if (V.getValueType() != MVT::i1)
1050610504
return false;
1050710505
switch (V.getOpcode()) {

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
586586
getTargetMMOFlags(const Instruction &I) const override;
587587
};
588588

589+
// Returns true if argument is a boolean value which is not serialized into
590+
// memory or argument and does not require v_cndmask_b32 to be deserialized.
591+
bool isBoolSGPR(SDValue V);
592+
589593
} // End namespace llvm
590594

591595
#endif

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,3 +396,115 @@ true:
396396
false:
397397
ret i32 33
398398
}
399+
400+
declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32)
401+
402+
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
403+
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
404+
; CHECK: ; %bb.0:
405+
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
406+
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
407+
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
408+
; CHECK-NEXT: s_cbranch_vccnz .LBB20_2
409+
; CHECK-NEXT: ; %bb.1: ; %true
410+
; CHECK-NEXT: s_mov_b32 s0, 42
411+
; CHECK-NEXT: s_branch .LBB20_3
412+
; CHECK-NEXT: .LBB20_2: ; %false
413+
; CHECK-NEXT: s_mov_b32 s0, 33
414+
; CHECK-NEXT: s_branch .LBB20_3
415+
; CHECK-NEXT: .LBB20_3:
416+
%v1c = icmp ult i32 %v1, 12
417+
%v2c = icmp ugt i32 %v2, 34
418+
%c = and i1 %v1c, %v2c
419+
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
420+
%ballot_ne_zero = icmp ne i32 %ballot, 0
421+
br i1 %ballot_ne_zero, label %true, label %false
422+
true:
423+
ret i32 42
424+
false:
425+
ret i32 33
426+
}
427+
428+
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
429+
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
430+
; CHECK: ; %bb.0:
431+
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
432+
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
433+
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
434+
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
435+
; CHECK-NEXT: s_and_b32 s0, s0, s1
436+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
437+
; CHECK-NEXT: s_cbranch_scc1 .LBB21_2
438+
; CHECK-NEXT: ; %bb.1: ; %true
439+
; CHECK-NEXT: s_mov_b32 s0, 42
440+
; CHECK-NEXT: s_branch .LBB21_3
441+
; CHECK-NEXT: .LBB21_2: ; %false
442+
; CHECK-NEXT: s_mov_b32 s0, 33
443+
; CHECK-NEXT: s_branch .LBB21_3
444+
; CHECK-NEXT: .LBB21_3:
445+
%v1c = icmp ult i32 %v1, 12
446+
%v2c = icmp ugt i32 %v2, 34
447+
%c = and i1 %v1c, %v2c
448+
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
449+
%ballot_ne_zero = icmp ne i32 %ballot, 0
450+
br i1 %ballot_ne_zero, label %true, label %false
451+
true:
452+
ret i32 42
453+
false:
454+
ret i32 33
455+
}
456+
457+
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
458+
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
459+
; CHECK: ; %bb.0:
460+
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
461+
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
462+
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
463+
; CHECK-NEXT: s_cbranch_vccnz .LBB22_2
464+
; CHECK-NEXT: ; %bb.1: ; %false
465+
; CHECK-NEXT: s_mov_b32 s0, 33
466+
; CHECK-NEXT: s_branch .LBB22_3
467+
; CHECK-NEXT: .LBB22_2: ; %true
468+
; CHECK-NEXT: s_mov_b32 s0, 42
469+
; CHECK-NEXT: s_branch .LBB22_3
470+
; CHECK-NEXT: .LBB22_3:
471+
%v1c = icmp ult i32 %v1, 12
472+
%v2c = icmp ugt i32 %v2, 34
473+
%c = and i1 %v1c, %v2c
474+
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
475+
%ballot_eq_zero = icmp eq i32 %ballot, 0
476+
br i1 %ballot_eq_zero, label %true, label %false
477+
true:
478+
ret i32 42
479+
false:
480+
ret i32 33
481+
}
482+
483+
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
484+
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
485+
; CHECK: ; %bb.0:
486+
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
487+
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
488+
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
489+
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
490+
; CHECK-NEXT: s_and_b32 s0, s0, s1
491+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
492+
; CHECK-NEXT: s_cbranch_scc1 .LBB23_2
493+
; CHECK-NEXT: ; %bb.1: ; %false
494+
; CHECK-NEXT: s_mov_b32 s0, 33
495+
; CHECK-NEXT: s_branch .LBB23_3
496+
; CHECK-NEXT: .LBB23_2: ; %true
497+
; CHECK-NEXT: s_mov_b32 s0, 42
498+
; CHECK-NEXT: s_branch .LBB23_3
499+
; CHECK-NEXT: .LBB23_3:
500+
%v1c = icmp ult i32 %v1, 12
501+
%v2c = icmp ugt i32 %v2, 34
502+
%c = and i1 %v1c, %v2c
503+
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
504+
%ballot_eq_zero = icmp eq i32 %ballot, 0
505+
br i1 %ballot_eq_zero, label %true, label %false
506+
true:
507+
ret i32 42
508+
false:
509+
ret i32 33
510+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,3 +399,115 @@ true:
399399
false:
400400
ret i32 33
401401
}
402+
403+
declare i64 @llvm.amdgcn.icmp.i64(i1, i1, i32)
404+
405+
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
406+
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
407+
; CHECK: ; %bb.0:
408+
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
409+
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
410+
; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
411+
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
412+
; CHECK-NEXT: ; %bb.1: ; %true
413+
; CHECK-NEXT: s_mov_b32 s0, 42
414+
; CHECK-NEXT: s_branch .LBB0_3
415+
; CHECK-NEXT: .LBB0_2: ; %false
416+
; CHECK-NEXT: s_mov_b32 s0, 33
417+
; CHECK-NEXT: s_branch .LBB0_3
418+
; CHECK-NEXT: .LBB0_3:
419+
%v1c = icmp ult i32 %v1, 12
420+
%v2c = icmp ugt i32 %v2, 34
421+
%c = and i1 %v1c, %v2c
422+
%ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
423+
%ballot_ne_zero = icmp ne i64 %ballot, 0
424+
br i1 %ballot_ne_zero, label %true, label %false
425+
true:
426+
ret i32 42
427+
false:
428+
ret i32 33
429+
}
430+
431+
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
432+
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
433+
; CHECK: ; %bb.0:
434+
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
435+
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
436+
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
437+
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
438+
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
439+
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
440+
; CHECK-NEXT: s_cbranch_scc1 .LBB1_2
441+
; CHECK-NEXT: ; %bb.1: ; %true
442+
; CHECK-NEXT: s_mov_b32 s0, 42
443+
; CHECK-NEXT: s_branch .LBB1_3
444+
; CHECK-NEXT: .LBB1_2: ; %false
445+
; CHECK-NEXT: s_mov_b32 s0, 33
446+
; CHECK-NEXT: s_branch .LBB1_3
447+
; CHECK-NEXT: .LBB1_3:
448+
%v1c = icmp ult i32 %v1, 12
449+
%v2c = icmp ugt i32 %v2, 34
450+
%c = and i1 %v1c, %v2c
451+
%ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
452+
%ballot_ne_zero = icmp ne i64 %ballot, 0
453+
br i1 %ballot_ne_zero, label %true, label %false
454+
true:
455+
ret i32 42
456+
false:
457+
ret i32 33
458+
}
459+
460+
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
461+
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
462+
; CHECK: ; %bb.0:
463+
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
464+
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
465+
; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
466+
; CHECK-NEXT: s_cbranch_vccnz .LBB2_2
467+
; CHECK-NEXT: ; %bb.1: ; %false
468+
; CHECK-NEXT: s_mov_b32 s0, 33
469+
; CHECK-NEXT: s_branch .LBB2_3
470+
; CHECK-NEXT: .LBB2_2: ; %true
471+
; CHECK-NEXT: s_mov_b32 s0, 42
472+
; CHECK-NEXT: s_branch .LBB2_3
473+
; CHECK-NEXT: .LBB2_3:
474+
%v1c = icmp ult i32 %v1, 12
475+
%v2c = icmp ugt i32 %v2, 34
476+
%c = and i1 %v1c, %v2c
477+
%ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
478+
%ballot_eq_zero = icmp eq i64 %ballot, 0
479+
br i1 %ballot_eq_zero, label %true, label %false
480+
true:
481+
ret i32 42
482+
false:
483+
ret i32 33
484+
}
485+
486+
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
487+
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
488+
; CHECK: ; %bb.0:
489+
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
490+
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
491+
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
492+
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
493+
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
494+
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
495+
; CHECK-NEXT: s_cbranch_scc1 .LBB3_2
496+
; CHECK-NEXT: ; %bb.1: ; %false
497+
; CHECK-NEXT: s_mov_b32 s0, 33
498+
; CHECK-NEXT: s_branch .LBB3_3
499+
; CHECK-NEXT: .LBB3_2: ; %true
500+
; CHECK-NEXT: s_mov_b32 s0, 42
501+
; CHECK-NEXT: s_branch .LBB3_3
502+
; CHECK-NEXT: .LBB3_3:
503+
%v1c = icmp ult i32 %v1, 12
504+
%v2c = icmp ugt i32 %v2, 34
505+
%c = and i1 %v1c, %v2c
506+
%ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
507+
%ballot_eq_zero = icmp eq i64 %ballot, 0
508+
br i1 %ballot_eq_zero, label %true, label %false
509+
true:
510+
ret i32 42
511+
false:
512+
ret i32 33
513+
}

0 commit comments

Comments
 (0)