[SLP] Match poison as instruction with the same opcode #115946
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes: The patch allows vectorizing a scalar instruction + poison values as if the poisons were instructions with the same opcode (measured with AVX512, -O3 + LTO; e.g. JM/ldecod gets better vector code; full results are in the commit message at the end of this thread).

Patch is 50.97 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115946.diff

4 Files Affected:
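At the core of the change is relaxing the "all lanes are instructions" checks to also admit poison lanes; a simplified excerpt of the recurring pattern from the diff below:

```cpp
// Before: every lane of the bundle must be an Instruction.
if (!all_of(VL, IsaPred<Instruction>))
  return InstructionsState::invalid();

// After: poison lanes are admitted; later checks ensure enough real
// instructions remain to define the bundle's opcode.
if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
  return InstructionsState::invalid();
```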
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1bf082d57b8bb0..bbc2560778541d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -456,15 +456,18 @@ static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
+ auto *It = find_if(VL, IsaPred<Instruction>);
+ if (It == VL.end())
return false;
+ Instruction *I0 = cast<Instruction>(*It);
if (all_of(VL, isVectorLikeInstWithConstOps))
return true;
BasicBlock *BB = I0->getParent();
- for (int I = 1, E = VL.size(); I < E; I++) {
- auto *II = dyn_cast<Instruction>(VL[I]);
+ for (Value *V : iterator_range(It, VL.end())) {
+ if (isa<PoisonValue>(V))
+ continue;
+ auto *II = dyn_cast<Instruction>(V);
if (!II)
return false;
@@ -893,10 +896,19 @@ static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI) {
// Make sure these are all Instructions.
- if (!all_of(VL, IsaPred<Instruction>))
+ if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
+ return InstructionsState::invalid();
+
+ auto *It = find_if(VL, IsaPred<Instruction>);
+ if (It == VL.end())
+ return InstructionsState::invalid();
+
+ Value *V = *It;
+ unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
+ if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
+ (VL.size() == 2 && InstCnt < 2))
return InstructionsState::invalid();
- Value *V = VL.front();
bool IsCastOp = isa<CastInst>(V);
bool IsBinOp = isa<BinaryOperator>(V);
bool IsCmpOp = isa<CmpInst>(V);
@@ -904,7 +916,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
IsCmpOp ? cast<CmpInst>(V)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE;
unsigned Opcode = cast<Instruction>(V)->getOpcode();
unsigned AltOpcode = Opcode;
- unsigned AltIndex = 0;
+ unsigned AltIndex = std::distance(VL.begin(), It);
bool SwappedPredsCompatible = [&]() {
if (!IsCmpOp)
@@ -940,8 +952,17 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
return InstructionsState::invalid();
}
+ bool AnyPoison = InstCnt != VL.size();
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- auto *I = cast<Instruction>(VL[Cnt]);
+ auto *I = dyn_cast<Instruction>(VL[Cnt]);
+ if (!I)
+ continue;
+
+ // Cannot combine poison and divisions.
+ if (AnyPoison && (I->isIntDivRem() || I->isArithmeticShift() ||
+ I->getOpcode() == Instruction::FDiv ||
+ I->getOpcode() == Instruction::FRem || isa<CallInst>(I)))
+ return InstructionsState::invalid();
unsigned InstOpcode = I->getOpcode();
if (IsBinOp && isa<BinaryOperator>(I)) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
@@ -1177,10 +1198,13 @@ static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
Type *ScalarTy = VL[0]->getType();
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
- for (unsigned Lane : seq<unsigned>(VL.size()))
+ for (unsigned Lane : seq<unsigned>(VL.size())) {
+ if (isa<PoisonValue>(VL[Lane]))
+ continue;
if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
OpcodeMask.set(Lane * ScalarTyNumElements,
Lane * ScalarTyNumElements + ScalarTyNumElements);
+ }
return OpcodeMask;
}
@@ -1781,13 +1805,17 @@ class BoUpSLP {
(S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
!S.isAltShuffle()) &&
all_of(Ops, [&S](Value *V) {
- return cast<Instruction>(V)->getNumOperands() ==
- S.MainOp->getNumOperands();
+ return isa<PoisonValue>(V) ||
+ cast<Instruction>(V)->getNumOperands() ==
+ S.MainOp->getNumOperands();
}))
return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
: LookAheadHeuristics::ScoreSameOpcode;
}
+ if (I1 && isa<PoisonValue>(V2))
+ return LookAheadHeuristics::ScoreSameOpcode;
+
if (isa<UndefValue>(V2))
return LookAheadHeuristics::ScoreUndef;
@@ -2336,17 +2364,17 @@ class BoUpSLP {
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
- assert(isa<Instruction>(VL[0]) && "Expected instruction");
constexpr unsigned IntrinsicNumOperands = 2;
- unsigned NumOperands = isa<IntrinsicInst>(VL[0])
- ? IntrinsicNumOperands
- : cast<Instruction>(VL[0])->getNumOperands();
+ auto *VL0 = cast<Instruction>(*find_if(VL, IsaPred<Instruction>));
+ unsigned NumOperands = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands
+ : VL0->getNumOperands();
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
OpsVec[OpIdx].resize(NumLanes);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
+ assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
+ "Expected instruction or poison value");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
@@ -2357,6 +2385,12 @@ class BoUpSLP {
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely
// tell the inverse operations by checking commutativity.
+ if (isa<PoisonValue>(VL[Lane])) {
+ OpsVec[OpIdx][Lane] = {
+ PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+ false};
+ continue;
+ }
bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
@@ -2451,7 +2485,7 @@ class BoUpSLP {
Value *OpILn = getValue(OpI, Ln);
return (L && L->isLoopInvariant(OpILn)) ||
(getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
- Op->getParent() == cast<Instruction>(OpILn)->getParent());
+ allSameBlock({Op, OpILn}));
}))
return true;
}
@@ -2463,7 +2497,8 @@ class BoUpSLP {
VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
: TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
L(R.LI->getLoopFor(
- (cast<Instruction>(RootVL.front())->getParent()))) {
+ (cast<Instruction>(*find_if(RootVL, IsaPred<Instruction>))
+ ->getParent()))) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -3265,13 +3300,18 @@ class BoUpSLP {
/// Set the operands of this bundle in their original order.
void setOperandsInOrder() {
assert(Operands.empty() && "Already initialized?");
- auto *I0 = cast<Instruction>(Scalars[0]);
+ auto *I0 = cast<Instruction>(*find_if(Scalars, IsaPred<Instruction>));
Operands.resize(I0->getNumOperands());
unsigned NumLanes = Scalars.size();
for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
OpIdx != NumOperands; ++OpIdx) {
Operands[OpIdx].resize(NumLanes);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ if (isa<PoisonValue>(Scalars[Lane])) {
+ Operands[OpIdx][Lane] =
+ PoisonValue::get(I0->getOperand(OpIdx)->getType());
+ continue;
+ }
auto *I = cast<Instruction>(Scalars[Lane]);
assert(I->getNumOperands() == NumOperands &&
"Expected same number of operands");
@@ -4891,8 +4931,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
PointerOps.resize(Sz);
auto *POIter = PointerOps.begin();
for (Value *V : VL) {
- auto *L = cast<LoadInst>(V);
- if (!L->isSimple())
+ auto *L = dyn_cast<LoadInst>(V);
+ if (!L || !L->isSimple())
return LoadsState::Gather;
*POIter = L->getPointerOperand();
++POIter;
@@ -5470,6 +5510,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
TE.ReuseShuffleIndices.end());
if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
all_of(TE.Scalars, [Sz](Value *V) {
+ if (isa<PoisonValue>(V))
+ return true;
std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
return Idx && *Idx < Sz;
})) {
@@ -5554,7 +5596,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
auto PHICompare = [&](unsigned I1, unsigned I2) {
Value *V1 = TE.Scalars[I1];
Value *V2 = TE.Scalars[I2];
- if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
+ if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) ||
+ isa<PoisonValue>(V1) || isa<PoisonValue>(V2))
return false;
if (V1->getNumUses() < V2->getNumUses())
return true;
@@ -7319,8 +7362,14 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
Operands.emplace_back();
// Prepare the operand vector.
- for (Value *V : VL)
+ for (Value *V : VL) {
+ if (isa<PoisonValue>(V)) {
+ Operands.back().push_back(
+ PoisonValue::get(S.MainOp->getOperand(I)->getType()));
+ continue;
+ }
Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
+ }
}
if (Operands.size() == 2) {
// Try find best operands candidates.
@@ -7427,8 +7476,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
if (VL0->getNumOperands() > MaxPHINumOperands)
return TreeEntry::NeedToGather;
// Check for terminator values (e.g. invoke).
- for (Value *V : VL)
- for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
+ for (Value *V : VL) {
+ auto *PHI = dyn_cast<PHINode>(V);
+ if (!PHI)
+ continue;
+ for (Value *Incoming : PHI->incoming_values()) {
Instruction *Term = dyn_cast<Instruction>(Incoming);
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
@@ -7436,6 +7488,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::NeedToGather;
}
}
+ }
return TreeEntry::Vectorize;
}
@@ -7511,8 +7564,10 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- else if (any_of(VL,
- [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
+ else if (any_of(VL, [](Value *V) {
+ auto *LI = dyn_cast<LoadInst>(V);
+ return !LI || !LI->isSimple();
+ }))
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
else
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -7536,6 +7591,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (Value *V : VL) {
+ if (isa<PoisonValue>(V))
+ continue;
Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
LLVM_DEBUG(
@@ -7552,7 +7609,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
Type *ComparedTy = VL0->getOperand(0)->getType();
for (Value *V : VL) {
- CmpInst *Cmp = cast<CmpInst>(V);
+ if (isa<PoisonValue>(V))
+ continue;
+ auto *Cmp = cast<CmpInst>(V);
if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
@@ -7795,7 +7854,13 @@ class PHIHandler {
}
// Prepare the operand vector.
for (auto [Idx, V] : enumerate(Phis)) {
- auto *P = cast<PHINode>(V);
+ auto *P = dyn_cast<PHINode>(V);
+ if (!P) {
+ assert(isa<PoisonValue>(V) &&
+ "Expected isa instruction or poison value.");
+ Operands[I][Idx] = V;
+ continue;
+ }
if (P->getIncomingBlock(I) == InBB)
Operands[I][Idx] = P->getIncomingValue(I);
else
@@ -7814,6 +7879,11 @@ class PHIHandler {
Blocks.try_emplace(InBB).first->second.push_back(I);
}
for (auto [Idx, V] : enumerate(Phis)) {
+ if (isa<PoisonValue>(V)) {
+ for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
+ Operands[I][Idx] = V;
+ continue;
+ }
auto *P = cast<PHINode>(V);
for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
BasicBlock *InBB = P->getIncomingBlock(I);
@@ -7863,7 +7933,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (isConstant(V)) {
ReuseShuffleIndices.emplace_back(
- isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
+ isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
UniqueValues.emplace_back(V);
continue;
}
@@ -7895,11 +7965,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}))) {
if (DoNotFail && UniquePositions.size() > 1 &&
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
- all_of(UniqueValues, [=](Value *V) {
- return isa<ExtractElementInst>(V) ||
- areAllUsersVectorized(cast<Instruction>(V),
- UserIgnoreList);
- })) {
+ all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
// Find the number of elements, which forms full vectors.
unsigned PWSz = getFullVectorNumberOfElements(
*TTI, UniqueValues.front()->getType(), UniqueValues.size());
@@ -7907,8 +7973,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ReuseShuffleIndices.clear();
} else {
NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
- NonUniqueValueVL.append(PWSz - UniqueValues.size(),
- UniqueValues.back());
+ NonUniqueValueVL.append(
+ PWSz - UniqueValues.size(),
+ PoisonValue::get(UniqueValues.front()->getType()));
VL = NonUniqueValueVL;
}
return true;
@@ -8043,7 +8110,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return true;
// Check if all operands are extracts, part of vector node or can build a
// regular vectorize node.
- SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
+ SmallVector<unsigned, 8> InstsCount;
for (Value *V : VL) {
auto *I = cast<Instruction>(V);
InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
@@ -8437,6 +8504,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
+ if (isa<PoisonValue>(V)) {
+ Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
+ Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
+ continue;
+ }
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
@@ -8636,7 +8708,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (isa<BinaryOperator>(VL0) || CI) {
ValueList Left, Right;
if (!CI || all_of(VL, [](Value *V) {
- return cast<CmpInst>(V)->isCommutative();
+ return isa<PoisonValue>(V) || cast<CmpInst>(V)->isCommutative();
})) {
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
} else {
@@ -8649,6 +8721,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Collect operands - commute if it uses the swapped predicate or
// alternate operation.
for (Value *V : VL) {
+ if (isa<PoisonValue>(V)) {
+ Left.push_back(
+ PoisonValue::get(MainCI->getOperand(0)->getType()));
+ Right.push_back(
+ PoisonValue::get(MainCI->getOperand(1)->getType()));
+ continue;
+ }
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
@@ -8853,6 +8932,8 @@ void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
unsigned Idx = I;
if (!ReorderIndices.empty())
Idx = OrderMask[I];
+ if (isa<PoisonValue>(Scalars[Idx]))
+ continue;
auto *OpInst = cast<Instruction>(Scalars[Idx]);
if (IsAltOp(OpInst)) {
Mask[I] = Sz + Idx;
@@ -9627,9 +9708,11 @@ void BoUpSLP::transformNodes() {
// Try to vectorize reduced values or if all users are vectorized.
// For expensive instructions extra extracts might be profitable.
if ((!UserIgnoreList || E.Idx != 0) &&
- TTI->getInstructionCost(cast<Instruction>(Slice.front()),
- CostKind) < TTI::TCC_Expensive &&
+ TTI->getInstructionCost(S.MainOp, CostKind) <
+ TTI::TCC_Expensive &&
!all_of(Slice, [&](Value *V) {
+ if (isa<PoisonValue>(V))
+ return true;
return areAllUsersVectorized(cast<Instruction>(V),
UserIgnoreList);
}))
@@ -9652,12 +9735,13 @@ void BoUpSLP::transformNodes() {
continue;
}
} else if (S.getOpcode() == Instruction::ExtractElement ||
- (TTI->getInstructionCost(
- cast<Instruction>(Slice.front()), CostKind) <
+ (TTI->getInstructionCost(S.MainOp, CostKind) <
TTI::TCC_Expensive &&
!CheckOperandsProfitability(
- cast<Instruction>(Slice.front()),
- cast<Instruction>(Slice.back()), S))) {
+ S.MainOp,
+ cast<Instruction>(*find_if(reverse(Slice),
+ IsaPred<Instruction>)),
+ S))) {
// Do not vectorize extractelements (handled effectively
// alread). Do not vectorize non-profitable instructions (with
// low cost and non-vectorizable operands.)
@@ -10887,7 +10971,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
const unsigned Sz = UniqueValues.size();
SmallBitVector UsedScalars(Sz, false);
for (unsigned ...
[truncated]
continue;

// Cannot combine poison and divisions.
if (AnyPoison && (I->isIntDivRem() || I->isArithmeticShift() ||
AShr is cheap and it doesn't cause immediate UB.
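A minimal sketch of the check with the `isArithmeticShift()` exclusion removed, as this feedback suggests (hypothetical follow-up, not necessarily the exact code that landed):

```cpp
// ashr on a poison operand only yields poison, not immediate UB, so it no
// longer blocks mixing with poison lanes; div/rem and calls still do.
if (AnyPoison && (I->isIntDivRem() || I->getOpcode() == Instruction::FDiv ||
                  I->getOpcode() == Instruction::FRem || isa<CallInst>(I)))
  return InstructionsState::invalid();
```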
Created using spr 1.3.5
Ping!
Created using spr 1.3.5
LGTM
We're hitting an assert after this. See https://crbug.com/380805017#comment4 for a reproducer. Can you please take a look?
Looking, will fix ASAP.
Thanks! In case it's useful, creduce just finished with a smaller repro here: https://crbug.com/380805017#comment6
Hmm, I tried both reproducers (the original and the reduced one) and was unable to reproduce using top of trunk. Can you recheck, please?
You're right, it looks like you fixed it with f953b5e already. Sorry for the noise!
Patch landed as llvm#115946 (full commit message at the end of this thread). Reviewers: RKSimon. Reviewed By: RKSimon. Change-Id: Ie96d4cd3ff69dd1efe59ff56a9a7393fed43d3aa
This commit (...somehow...) broke Triton/PyTorch on CUDA (see triton-lang/triton#5591 for a description). I have attached the failing and passing LLIR: failing_triton_poi_fused_33.llir.txt, passing_triton_poi_fused_33.llir.txt. @alexey-bataev, any guidance would be appreciated (I am not familiar with the SLP vectorizer...).
I have "bisected" the current CUDA regression (mentioned [here](#5341)) on A100: ``` git bisect start # bad: [f5a35a31bfe6cbc16bec0c130f2bb3632dbf1fbf] [LV] Add test cases with incorrect IV live-outs. git bisect bad f5a35a31bfe6cbc16bec0c130f2bb3632dbf1fbf # good: [86b69c31642e98f8357df62c09d118ad1da4e16a] [SPIR-V] Fix SPIR-V extension SPV_INTEL_function_pointers: ... # first bad commit: [b8703369daf777706196ff914c0376c27adde3cf] [SLP] Match poison as instruction with the same opcode ``` The failure presents as : ``` Traceback (most recent call last): File "/home/mleventa/dev_projects/repro/repro.py", line 135, in <module> torch.cuda.synchronize() File "/home/mleventa/dev_projects/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 987, in synchronize return torch._C._cuda_synchronize() RuntimeError: CUDA error: an illegal memory access was encountered ``` The failing commit is [`b870336`](llvm/llvm-project#115946) and the last working commit is `2fe947b4`. It's not clear to me the relationship but will investigate. This PR to `llvm-head` is to verify that my repro on A100 matches the internal regression/fail.
Try to check the latest head. I don't quite understand what the problem is. What shall I do to reproduce the issue?
Yes, I need the IR before the SLP vectorizer runs.
The only change here is due to deprecation of `applyPatternsAndFoldGreedily`. Note, this is the last commit before llvm/llvm-project#115946, which leads to IMAs.
@alexey-bataev here are the logs (log_passing.txt). Note, they are fairly large (~10 MB), but I thought it would be better to include everything just to be safe.
@alexey-bataev ah sorry - the initial dumps I pasted weren't at the right point in the pass pipeline; those different stores actually appear at the end of
What is the target triple, nvptx64-unknown-unknown?
We/Triton leave it "empty" for internal reasons that I currently don't have context on: https://github.com/triton-lang/triton/blob/main/python/src/llvm.cc#L333-L339
@alexey-bataev actually sorry, let me double check that (that comment might be stale). EDIT: no, that's correct - the target is left empty.
Try to update the compiler to the most recent version. I see an incorrect vectorization result in the failed version, which was fixed already.
I/we've tried
This one is from Dec 30; I fixed a bug earlier today.
Excellent - in my local build the illegal access is gone. I'll come back if Triton internal fails, but I think we're good. Thank you for your help!
The patch allows vectorizing a scalar instruction + poison values as if
the poisons were instructions with the same opcode. It allows better
vectorization of repeated values, reduces the number of insertelement
instructions, and serves as groundwork for vectorization of copyable
elements (a minimal sketch of the matching rule follows the results list below).
Results (AVX512, -O3 + LTO):
JM/ldecod - better vector code
Applications/oggenc - better vectorization
CINT2017speed/625.x264_s
CINT2017rate/525.x264_r - better vector code
CFP2017rate/526.blender_r - better vector code
CFP2006/447.dealII - small variations
Benchmarks/Bullet - extra vector code
CFP2017rate/510.parest_r - better vectorization
CINT2017rate/502.gcc_r
CINT2017speed/602.gcc_s - extra vector code
Benchmarks/tramp3d-v4 - small variations
CFP2006/453.povray - extra vector code
JM/lencod - better vector code
CFP2017rate/511.povray_r - extra vector code
MemFunctions/MemFunctions - extra vector code
LoopVectorization/LoopVectorizationBenchmarks - extra vector code
XRay/FDRMode - extra vector code
XRay/ReturnReference - extra vector code
LCALS/SubsetCLambdaLoops - extra vector code
LCALS/SubsetCRawLoops - extra vector code
LCALS/SubsetARawLoops - extra vector code
LCALS/SubsetALambdaLoops - extra vector code
DOE-ProxyApps-C++/miniFE - extra vector code
LoopVectorization/LoopInterleavingBenchmarks - extra vector code
LCALS/SubsetBLambdaLoops - extra vector code
MicroBenchmarks/harris - extra vector code
ImageProcessing/Dither - extra vector code
MicroBenchmarks/SLPVectorization - extra vector code
ImageProcessing/Blur - extra vector code
ImageProcessing/Dilate - extra vector code
Builtins/Int128 - extra vector code
ImageProcessing/Interpolation - extra vector code
ImageProcessing/BilateralFiltering - extra vector code
ImageProcessing/AnisotropicDiffusion - extra vector code
MicroBenchmarks/LoopInterchange - extra code vectorized
LCALS/SubsetBRawLoops - extra code vectorized
CINT2006/464.h264ref - extra vectorization with wider vectors
CFP2017rate/508.namd_r - small variations, extra phis vectorized
CFP2006/444.namd - two 2 x phis replaced by a 4 x phi
DOE-ProxyApps-C/SimpleMOC - extra code vectorized
CINT2017rate/541.leela_r
CINT2017speed/641.leela_s - the function better vectorized and inlined
Benchmarks/Misc/oourafft - two 4 x bit reductions replaced by 2 x vector code
FreeBench/fourinarow - better vectorization
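As a rough illustration of the matching rule referenced above, here is a minimal standalone sketch that treats poison lanes as wildcards (an assumed simplification for illustration; the real `getSameOpcode` additionally handles alternate opcodes, compare predicates, and the poison/division restrictions shown in the diff):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Returns the common opcode of a bundle of scalars, treating poison lanes
// as wildcards that match any opcode. Returns 0 (not a valid opcode) when
// lanes disagree or no real instruction defines the bundle's opcode.
static unsigned getBundleOpcode(ArrayRef<Value *> VL) {
  unsigned Opcode = 0;
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue; // Poison matches whatever opcode the other lanes have.
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return 0; // A non-instruction, non-poison lane cannot be matched.
    if (!Opcode)
      Opcode = I->getOpcode();
    else if (I->getOpcode() != Opcode)
      return 0; // Mixed opcodes (alternate-opcode support omitted here).
  }
  return Opcode;
}
```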