Skip to content

Improve selection of conditional branch on amdgcn.ballot!=0 condition in SelectionDAG. #68714

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 6, 2023
Merged
72 changes: 70 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
Expand Down Expand Up @@ -2263,6 +2264,34 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
return false;
}

/// Try to look through the i(WaveSize) compare produced for amdgcn.ballot.
///
/// Matches:
///   %Cond = i1 (and/or combination of i1 ISD::SETCCs)
///   %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
/// and lets the caller use the i1 %Cond value instead of i(WaveSize) %VCMP.
/// This is possible because a divergent ISD::SETCC is selected as V_CMP, so
/// %Cond becomes an i(WaveSize) full mask value.
///
/// \param VCMP   Node whose opcode must be AMDGPUISD::SETCC (asserted).
/// \param Negate Written only when the pattern matches: set to true if the
///               compare uses SETEQ, false if it uses SETNE. Ballot itself
///               doesn't use SETEQ, but it's easy to support it here for
///               completeness.
/// \returns %Cond when the pattern matches, otherwise an empty SDValue.
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
  assert(VCMP->getOpcode() == AMDGPUISD::SETCC);

  // Only an (in)equality comparison qualifies.
  const auto CmpCode = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
  if (CmpCode != ISD::SETEQ && CmpCode != ISD::SETNE)
    return SDValue();

  // The right-hand side has to be the constant zero.
  const auto *ZeroRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
  if (!ZeroRHS || !ZeroRHS->isZero())
    return SDValue();

  // Skip a single extension wrapped around the compared value, if present.
  const SDValue Src = VCMP.getOperand(0);
  const SDValue BoolCond =
      ISD::isExtOpcode(Src->getOpcode()) ? Src.getOperand(0) : Src;

  if (!isBoolSGPR(BoolCond))
    return SDValue();

  Negate = CmpCode == ISD::SETEQ;
  return BoolCond;
}

void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
SDValue Cond = N->getOperand(1);

Expand All @@ -2276,11 +2305,50 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
const SIRegisterInfo *TRI = ST->getRegisterInfo();

bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
bool AndExec = !UseSCCBr;
bool Negate = false;

if (Cond.getOpcode() == ISD::SETCC &&
Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
SDValue VCMP = Cond->getOperand(0);
auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() &&
// TODO: make condition below an assert after fixing ballot bitwidth.
VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
// BRCOND i1 %C, %BB
// =>
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
// VCC = COPY i(WaveSize) %VCMP
// S_CBRANCH_VCCNZ/VCCZ %BB
Negate = CC == ISD::SETEQ;
bool NegatedBallot = false;
if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
Cond = BallotCond;
UseSCCBr = !BallotCond->isDivergent();
Negate = Negate ^ NegatedBallot;
} else {
// TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
// selected as V_CMP, but this may change for uniform condition.
Cond = VCMP;
UseSCCBr = false;
}
}
// Cond is now one of: a V_CMP resulting from AMDGPUISD::SETCC, a combination
// of V_CMPs resulting from a ballot, or the uniform condition of a ballot (in
// which case SCC is used).
AndExec = false;
}

unsigned BrOp =
UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
: (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);

if (!UseSCCBr) {
if (AndExec) {
// This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
// analyzed what generates the vcc value, so we do not know whether vcc
// bits for disabled lanes are 0. Thus we need to mask out bits for
Expand Down
4 changes: 1 addition & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10628,9 +10628,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
return SDValue();
}

// Returns true if argument is a boolean value which is not serialized into
// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
bool llvm::isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
getTargetMMOFlags(const Instruction &I) const override;
};

// Returns true if the argument is a boolean (i1) value which is not
// serialized into memory or into an argument, and which therefore does not
// require v_cndmask_b32 to be deserialized.
bool isBoolSGPR(SDValue V);

} // End namespace llvm

#endif
Loading