-
Notifications
You must be signed in to change notification settings - Fork 14.3k
InstSimplify: lookthru casts, binops in folding shuffles #92668
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1ffc5fe
21e90c0
d99b5b5
bc78925
36d7f84
2fecbeb
cedcc81
c1e7335
a13fb29
041187d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,7 +48,7 @@ using namespace llvm::PatternMatch; | |
|
||
#define DEBUG_TYPE "instsimplify" | ||
|
||
enum { RecursionLimit = 3 }; | ||
static unsigned RecursionLimit = 5; | ||
|
||
STATISTIC(NumExpand, "Number of expansions"); | ||
STATISTIC(NumReassoc, "Number of reassociations"); | ||
|
@@ -5315,55 +5315,107 @@ Value *llvm::simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, | |
return ::simplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit); | ||
} | ||
|
||
using ReplacementTy = std::optional<std::pair<Value *, Value *>>; | ||
|
||
/// For the given destination element of a shuffle, peek through shuffles to | ||
/// match a root vector source operand that contains that element in the same | ||
/// vector lane (ie, the same mask index), so we can eliminate the shuffle(s). | ||
static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, | ||
int MaskVal, Value *RootVec, | ||
unsigned MaxRecurse) { | ||
static std::pair<Value *, ReplacementTy> | ||
foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, int MaskVal, | ||
Value *RootVec, unsigned MaxRecurse) { | ||
if (!MaxRecurse--) | ||
return nullptr; | ||
return {nullptr, std::nullopt}; | ||
|
||
// Bail out if any mask value is undefined. That kind of shuffle may be | ||
// simplified further based on demanded bits or other folds. | ||
if (MaskVal == -1) | ||
return nullptr; | ||
return {nullptr, std::nullopt}; | ||
|
||
// The mask value chooses which source operand we need to look at next. | ||
int InVecNumElts = cast<FixedVectorType>(Op0->getType())->getNumElements(); | ||
unsigned InVecNumElts = | ||
cast<FixedVectorType>(Op0->getType())->getNumElements(); | ||
int RootElt = MaskVal; | ||
Value *SourceOp = Op0; | ||
if (MaskVal >= InVecNumElts) { | ||
if (MaskVal > -1 && static_cast<unsigned>(MaskVal) >= InVecNumElts) { | ||
RootElt = MaskVal - InVecNumElts; | ||
SourceOp = Op1; | ||
} | ||
|
||
// The next RootVec preseves an existing RootVec for casts and binops, so that | ||
// the final RootVec is the last cast or binop in the chain. | ||
Value *NextRootVec = RootVec ? RootVec : SourceOp; | ||
|
||
// Look through a cast instruction that preserves number of elements in the | ||
// vector. Set the RootVec to the cast, the SourceOp to the operand and | ||
// recurse. If, in a later stack frame, an appropriate ShuffleVector is | ||
// matched, the example will reduce. | ||
if (auto *SourceCast = dyn_cast<CastInst>(SourceOp)) { | ||
if (auto *CastTy = dyn_cast<FixedVectorType>(SourceCast->getSrcTy())) | ||
if (CastTy->getNumElements() == InVecNumElts) | ||
return foldIdentityShuffles(DestElt, SourceCast->getOperand(0), Op1, | ||
MaskVal, NextRootVec, MaxRecurse); | ||
} | ||
|
||
// Look through a binary operator, with two identical operands. Set the | ||
// RootVec to the binop, the SourceOp to the operand, and recurse. If, in a | ||
// later stack frame, an appropriate ShuffleVector is matched, the example | ||
// will reduce. | ||
Value *BinOpLHS = nullptr, *BinOpRHS = nullptr; | ||
if (match(SourceOp, m_BinOp(m_Value(BinOpLHS), m_Value(BinOpRHS))) && | ||
BinOpLHS == BinOpRHS) | ||
return foldIdentityShuffles(DestElt, BinOpLHS, Op1, MaskVal, NextRootVec, | ||
MaxRecurse); | ||
|
||
// If the source operand is a shuffle itself, look through it to find the | ||
// matching root vector. | ||
if (auto *SourceShuf = dyn_cast<ShuffleVectorInst>(SourceOp)) { | ||
// Here, we use RootVec, because there is no requirement for finding the | ||
// last shuffle in a chain. In fact, the zeroth operand of the first shuffle | ||
// in the chain will be used as the RootVec for folding. | ||
return foldIdentityShuffles( | ||
DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1), | ||
SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse); | ||
} | ||
|
||
// TODO: Look through bitcasts? What if the bitcast changes the vector element | ||
// size? | ||
|
||
// The source operand is not a shuffle. Initialize the root vector value for | ||
// this shuffle if that has not been done yet. | ||
if (!RootVec) | ||
RootVec = SourceOp; | ||
|
||
// Give up as soon as a source operand does not match the existing root value. | ||
if (RootVec != SourceOp) | ||
return nullptr; | ||
|
||
// The element must be coming from the same lane in the source vector | ||
// (although it may have crossed lanes in intermediate shuffles). | ||
if (RootElt != DestElt) | ||
return nullptr; | ||
return {nullptr, std::nullopt}; | ||
|
||
// If NextRootVec is equal to SourceOp, no replacements are required. Just | ||
// return NextRootVec as the leaf value of the recursion. | ||
if (NextRootVec == SourceOp) | ||
return {NextRootVec, std::nullopt}; | ||
|
||
auto *RootCast = dyn_cast<CastInst>(RootVec); | ||
auto *CastTy = | ||
RootCast ? dyn_cast<FixedVectorType>(RootCast->getSrcTy()) : nullptr; | ||
|
||
// We again have to match the condition for a vector-num-element-preserving | ||
// cast or binop with equal operands, as we are not assured of the recursion | ||
// happening from the call after the previous match. The RootVec was set to | ||
// the last cast or binop in the chain. | ||
if ((CastTy && CastTy->getNumElements() == InVecNumElts) || | ||
(match(RootVec, m_BinOp(m_Value(BinOpLHS), m_Value(BinOpRHS))) && | ||
BinOpLHS == BinOpRHS)) { | ||
// ReplacementSrc should be the User of the the first cast or binop in the | ||
// chain. SourceOp is the reduced value, which should replace | ||
// ReplacementSrc. | ||
if (auto *ReplacementSrc = SourceOp->getUniqueUndroppableUser()) { | ||
// If ReplacementSrc equals RootVec, it means that we didn't recurse after | ||
// matching a cast or binop, and we should terminate the recursion and | ||
// return the leaf value here. | ||
if (ReplacementSrc == RootVec) | ||
return {RootVec, std::nullopt}; | ||
// The User of ReplacementSrc is the first cast or binop in the chain. | ||
// There could be other Users, but we constrain it to a unique User, since | ||
// we perform RAUW later. | ||
if (ReplacementSrc->hasOneUser()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Checking for users in InstSimplify feels weird, can you check for hasOneUse on the input instead/ |
||
return {RootVec, std::make_pair(ReplacementSrc, SourceOp)}; | ||
} | ||
} | ||
|
||
return RootVec; | ||
return {nullptr, std::nullopt}; | ||
} | ||
|
||
static Value *simplifyShuffleVectorInst(Value *Op0, Value *Op1, | ||
|
@@ -5468,16 +5520,26 @@ static Value *simplifyShuffleVectorInst(Value *Op0, Value *Op1, | |
// shuffle. This handles simple identity shuffles as well as chains of | ||
// shuffles that may widen/narrow and/or move elements across lanes and back. | ||
Value *RootVec = nullptr; | ||
for (unsigned i = 0; i != MaskNumElts; ++i) { | ||
ReplacementTy ReplacementCandidate; | ||
for (unsigned Idx = 0; Idx != MaskNumElts; ++Idx) { | ||
// Note that recursion is limited for each vector element, so if any element | ||
// exceeds the limit, this will fail to simplify. | ||
RootVec = | ||
foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse); | ||
std::pair<Value *, ReplacementTy> Res = | ||
artagnon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
foldIdentityShuffles(Idx, Op0, Op1, Indices[Idx], RootVec, MaxRecurse); | ||
RootVec = Res.first; | ||
ReplacementCandidate = Res.second; | ||
|
||
// We can't replace a widening/narrowing shuffle with one of its operands. | ||
if (!RootVec || RootVec->getType() != RetTy) | ||
return nullptr; | ||
} | ||
|
||
// Apply any replacements in RootVec that are applicable in case we're looking | ||
// through a cast or binop. | ||
if (ReplacementCandidate) { | ||
auto [ReplacementSrc, ReplacementDst] = *ReplacementCandidate; | ||
ReplacementSrc->replaceAllUsesWith(ReplacementDst); | ||
} | ||
return RootVec; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,26 +6,11 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable | |
define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { | ||
; GCN-LABEL: select_constant_cttz: | ||
; GCN: ; %bb.0: | ||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 | ||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like the point of the test was defeated, so should do something to keep the codegen similar There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what we can do about this. Want to take a stab at making sure the test doesn't get reduced? Sorry, I'm not familiar with AMDGPU. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The AMDGPU part doesn't matter, just make sure whatever simplify now happens, still doesn't happen. It's super frustruating that InstSimplify ends up breaking codegen tests like this. Is this just from the recursive depth increase? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, it's due the increase in recursion depth, but we can limit it to the shuffle, as @dtcxzyw suggested. |
||
; GCN-NEXT: s_mov_b32 s3, 0xf000 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_lshr_b32 s4, 1, s2 | ||
; GCN-NEXT: s_cmp_lg_u32 s2, 0 | ||
; GCN-NEXT: s_ff1_i32_b32 s2, s4 | ||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 | ||
; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec | ||
; GCN-NEXT: s_cselect_b32 s2, -1, s2 | ||
; GCN-NEXT: s_flbit_i32 s6, s2 | ||
; GCN-NEXT: s_sub_i32 s8, 31, s6 | ||
; GCN-NEXT: s_cmp_eq_u32 s2, 0 | ||
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 | ||
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] | ||
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec | ||
; GCN-NEXT: s_cselect_b32 s4, -1, s8 | ||
; GCN-NEXT: s_mov_b32 s2, -1 | ||
; GCN-NEXT: v_mov_b32_e32 v0, s4 | ||
; GCN-NEXT: v_mov_b32_e32 v0, -1 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||
; GCN-NEXT: s_endpgm | ||
%v = load i32, ptr addrspace(1) %arrayidx, align 4 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@nikic Can you test the compile-time impact?
TBH I don't like increasing the recursion limit for all root instructions.
If necessary, it would be better to only increase the limit for
shufflevector
:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm surprised there's a threshold here that isn't 6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Compile-time: http://llvm-compile-time-tracker.com/compare.php?from=f6527774569790b5a5236f6e84f3f839ce6c2fff&to=4363c34b550a0908edf4ee89eda2fee72a096ba9&stat=instructions:u