Commit 3b25196

[CGP] Add support for sinking operands to their users, if they are free.
This patch improves code generation for some AArch64 ACLE intrinsics. It adds support to CGP to duplicate and sink operands to their user, if they can be folded into a target instruction, like zexts and a sub into usubl. It adds a TargetLowering hook, shouldSinkOperands, which looks at the operands of instructions to see if sinking is profitable.

I decided to add a new target hook because, for the sinking to be profitable, at least on AArch64 we have to look at multiple operands of an instruction, instead of looking at the users of a zext, for example.

The sinking is done in CGP because it works around an instruction selection limitation: if instruction selection were not limited to a single basic block, this patch would not be needed any longer. Alternatively, this could be done in the LoopSink pass, which tries to undo LICM for instructions in blocks that are not executed frequently.

Note that we do not force the operands to sink to have a single user, because we duplicate them before sinking. Therefore this is only desirable if the operands really can be duplicated for free. Additionally, we could consider the impact on live ranges later on.

This should fix https://bugs.llvm.org/show_bug.cgi?id=40025.

As for performance, we have internal code that uses these intrinsics and can be sped up by 10% by this change.

Reviewers: SjoerdMeijer, t.p.northover, samparker, efriedma, RKSimon, spatel

Reviewed By: samparker

Differential Revision: https://reviews.llvm.org/D57377

llvm-svn: 353152
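For concreteness, a small ACLE example (mine, not part of this commit) of the pattern the message describes. Assuming clang lowers vsubl_high_u8 to extract shuffles, zexts, and a sub, those instructions must end up in the same basic block as the sub for ISel to select usubl2:

#include <arm_neon.h>

// Widening subtract of the upper halves. Clang emits this as IR of the
// shape: sub (zext (shufflevector a, high half)),
//            (zext (shufflevector b, high half)).
// If the shuffles/zexts land in a different basic block than the sub,
// single-block instruction selection cannot fold them into one usubl2;
// the CGP change in this commit duplicates and sinks them next to the sub.
uint16x8_t sub_high(uint8x16_t a, uint8x16_t b) {
  return vsubl_high_u8(a, b);
}
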
1 parent e24b104 commit 3b25196

File tree

5 files changed: +404 −0 lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 10 additions & 0 deletions

@@ -2280,6 +2280,16 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if sinking I's operands to the same basic block as I is
+  /// profitable, e.g. because the operands can be folded into a target
+  /// instruction during instruction selection. After calling the function
+  /// \p Ops contains the Uses to sink ordered by dominance (dominating users
+  /// come first).
+  virtual bool shouldSinkOperands(Instruction *I,
+                                  SmallVectorImpl<Use *> &Ops) const {
+    return false;
+  }
+
   /// Return true if the target supplies and combines to a paired load
   /// two loaded values of type LoadedType next to each other in memory.
   /// RequiredAlignment gives the minimal alignment constraints that must be met

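As a sketch of how a target consumes this hook (hypothetical target and pattern, assumed for illustration; only the AArch64 override below is part of this commit), note that the hook receives Uses rather than Values, so CGP can rewrite the exact operand slots it is told about:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

// Hypothetical override: ask CGP to duplicate and sink both zext operands
// of a vector multiply, so ISel can fold them into a widening multiply.
bool MyTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;
  if (I->getOpcode() != Instruction::Mul || !I->getType()->isVectorTy())
    return false;
  if (!match(I->getOperand(0), m_ZExt(m_Value())) ||
      !match(I->getOperand(1), m_ZExt(m_Value())))
    return false;
  // Dominating uses must come first; these two uses are independent.
  Ops.push_back(&I->getOperandUse(0));
  Ops.push_back(&I->getOperandUse(1));
  return true;
}
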
llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 48 additions & 0 deletions

@@ -375,6 +375,8 @@ class TypePromotionTransaction;
     SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
   bool splitBranchCondition(Function &F);
   bool simplifyOffsetableRelocate(Instruction &I);
+
+  bool tryToSinkFreeOperands(Instruction *I);
 };
 
 } // end anonymous namespace

@@ -1752,6 +1754,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
     InsertedInsts.insert(ExtVal);
     return true;
   }
+
   case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group: {
     Value *ArgVal = II->getArgOperand(0);

@@ -5973,6 +5976,48 @@ bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
   return MadeChange;
 }
 
+bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
+  // If the operands of I can be folded into a target instruction together with
+  // I, duplicate and sink them.
+  SmallVector<Use *, 4> OpsToSink;
+  if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink))
+    return false;
+
+  // OpsToSink can contain multiple uses in a use chain (e.g.
+  // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
+  // uses must come first, which means they are sunk first, temporarily creating
+  // invalid IR. This will be fixed once their dominated users are sunk and
+  // updated.
+  BasicBlock *TargetBB = I->getParent();
+  bool Changed = false;
+  SmallVector<Use *, 4> ToReplace;
+  for (Use *U : OpsToSink) {
+    auto *UI = cast<Instruction>(U->get());
+    if (UI->getParent() == TargetBB || isa<PHINode>(UI))
+      continue;
+    ToReplace.push_back(U);
+  }
+
+  SmallPtrSet<Instruction *, 4> MaybeDead;
+  for (Use *U : ToReplace) {
+    auto *UI = cast<Instruction>(U->get());
+    Instruction *NI = UI->clone();
+    MaybeDead.insert(UI);
+    LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
+    NI->insertBefore(I);
+    InsertedInsts.insert(NI);
+    U->set(NI);
+    Changed = true;
+  }
+
+  // Remove instructions that are dead after sinking.
+  for (auto *I : MaybeDead)
+    if (!I->hasNUsesOrMore(1))
+      I->eraseFromParent();
+
+  return Changed;
+}
+
 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
   if (!TLI || !DL)
     return false;

@@ -6787,6 +6832,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
     return false;
   }
 
+  if (tryToSinkFreeOperands(I))
+    return true;
+
   if (CallInst *CI = dyn_cast<CallInst>(I))
     return optimizeCallInst(CI, ModifiedDT);

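A sketch (my example, not from the patch) of the scenario tryToSinkFreeOperands repairs, assuming LICM hoists the loop-invariant widening of b out of the loop:

#include <arm_neon.h>

// The extract shuffle and zext of 'b' are loop-invariant, so LICM can hoist
// them into the preheader. The sub in the loop body then has operands in a
// different block and cannot be selected as usubl2 on its own; the code
// above clones those instructions next to the sub, and the hoisted
// originals are erased once they have no remaining uses.
void sub_high_loop(uint16x8_t *out, const uint8x16_t *a, uint8x16_t b,
                   int n) {
  for (int i = 0; i < n; ++i)
    out[i] = vsubl_high_u8(a[i], b);
}
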
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 107 additions & 0 deletions

@@ -54,9 +54,11 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/Value.h"

@@ -86,6 +88,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "aarch64-lower"
 
@@ -8270,6 +8273,110 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
   return true;
 }
 
+/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
+/// or upper half of the vector elements.
+static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
+  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
+    auto *FullVT = cast<VectorType>(FullV->getType());
+    auto *HalfVT = cast<VectorType>(HalfV->getType());
+    return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+  };
+
+  auto extractHalf = [](Value *FullV, Value *HalfV) {
+    auto *FullVT = cast<VectorType>(FullV->getType());
+    auto *HalfVT = cast<VectorType>(HalfV->getType());
+    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
+  };
+
+  Constant *M1, *M2;
+  Value *S1Op1, *S2Op1;
+  if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
+      !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
+    return false;
+
+  // Check that the operands are half as wide as the result and we extract
+  // half of the elements of the input vectors.
+  if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
+      !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
+    return false;
+
+  // Check the mask extracts either the lower or upper half of vector
+  // elements.
+  int M1Start = -1;
+  int M2Start = -1;
+  int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
+  if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
+      !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
+      M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
+    return false;
+
+  return true;
+}
+
+/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
+/// of the vector elements.
+static bool areExtractExts(Value *Ext1, Value *Ext2) {
+  auto areExtDoubled = [](Instruction *Ext) {
+    return Ext->getType()->getScalarSizeInBits() ==
+           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
+  };
+
+  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
+      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
+      !areExtDoubled(cast<Instruction>(Ext1)) ||
+      !areExtDoubled(cast<Instruction>(Ext2)))
+    return false;
+
+  return true;
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
+bool AArch64TargetLowering::shouldSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  if (!I->getType()->isVectorTy())
+    return false;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_umull:
+      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
+        return false;
+      Ops.push_back(&II->getOperandUse(0));
+      Ops.push_back(&II->getOperandUse(1));
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  switch (I->getOpcode()) {
+  case Instruction::Sub:
+  case Instruction::Add: {
+    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+      return false;
+
+    // If the exts' operands extract either the lower or upper elements, we
+    // can sink them too.
+    auto Ext1 = cast<Instruction>(I->getOperand(0));
+    auto Ext2 = cast<Instruction>(I->getOperand(1));
+    if (areExtractShuffleVectors(Ext1, Ext2)) {
+      Ops.push_back(&Ext1->getOperandUse(0));
+      Ops.push_back(&Ext2->getOperandUse(0));
+    }
+
+    Ops.push_back(&I->getOperandUse(0));
+    Ops.push_back(&I->getOperandUse(1));
+
+    return true;
+  }
+  default:
+    return false;
+  }
+  return false;
+}
+
 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                           unsigned &RequiredAligment) const {
   if (!LoadedType.isSimple() ||

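For the intrinsic path, an ACLE example (mine, not from the commit) that produces aarch64_neon_umull fed by extract shuffles, i.e. the pattern areExtractShuffleVectors accepts:

#include <arm_neon.h>

// vget_high_u16 compiles to a shufflevector extracting the upper half;
// vmull_u16 becomes a call to @llvm.aarch64.neon.umull. With the shuffles
// sunk next to the intrinsic call, ISel can select a single umull2.
uint32x4_t mull_high(uint16x8_t a, uint16x8_t b) {
  return vmull_u16(vget_high_u16(a), vget_high_u16(b));
}
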
llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 0 deletions

@@ -327,6 +327,9 @@ class AArch64TargetLowering : public TargetLowering {
   bool isZExtFree(EVT VT1, EVT VT2) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
 
+  bool shouldSinkOperands(Instruction *I,
+                          SmallVectorImpl<Use *> &Ops) const override;
+
   bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
 
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
