Skip to content

Commit ae7244a

Browse files
committed
[InstCombine] Transform high latency, dependent FSQRT/FDIV into FMUL
The proposed patch, in general, tries to transform the following code sequence:

    x  = 1.0 / sqrt(a);
    r1 = x * x;         // same as 1.0 / a
    r2 = a / sqrt(a);   // same as sqrt(a)

into the following (provided x, r1 and r2 are all used further in the code):

    tmp1 = 1.0 / a
    tmp2 = sqrt(a)
    tmp3 = tmp1 * tmp2
    x    = tmp3
    r1   = tmp1
    r2   = tmp2

The transform makes the high-latency sqrt and div operations independent of each other and also saves one multiplication. The patch was tested with the SPEC17 suite with cpu=neoverse-v2. The performance uplift achieved was: 544.nab_r ~4%. No regressions were observed in the other benchmarks, and no compile-time differences were observed with the patch. Closes #54652
1 parent e05c1b4 commit ae7244a

File tree

2 files changed

+602
-0
lines changed

2 files changed

+602
-0
lines changed

llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,100 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
626626
return nullptr;
627627
}
628628

629+
/// Decide whether the rewrite
///   x  = 1.0 / sqrt(a)
///   r1 = x * x
///   r2 = a / sqrt(a)
/// into
///   r1 = 1.0 / a
///   r2 = sqrt(a)
///   x  = r1 * r2
/// is permitted by the fast-math flags and the basic-block placement of the
/// instructions involved.
///
/// \param X  the fdiv computing 1.0 / sqrt(a).
/// \param R1 the collected `x * x` multiplications.
/// \param R2 the collected `a / sqrt(a)` divisions.
/// \returns true when every constraint below holds; false otherwise, including
///          when R1 or R2 is empty.
static bool isFSqrtDivToFMulLegal(Instruction *X,
                                  const SmallVectorImpl<Instruction *> &R1,
                                  const SmallVectorImpl<Instruction *> &R2) {
  // The first element of each list is dereferenced below, so an empty list
  // can never describe a legal pattern.
  if (R1.empty() || R2.empty())
    return false;

  BasicBlock *BBx = X->getParent();
  BasicBlock *BBr1 = R1[0]->getParent();
  BasicBlock *BBr2 = R2[0]->getParent();

  // Check the constraints on instruction X.
  auto XConstraintsSatisfied = [X]() {
    // We change x = 1/sqrt(a) to x = sqrt(a) * 1/a . This change isn't allowed
    // by recip fp as it is strictly meant to transform ops of type a/b to
    // a * 1/b. So, this can be considered as algebraic rewrite and reassoc flag
    // has been used(rather abused)in the past for algebraic rewrites.
    if (!X->hasAllowReassoc())
      return false;
    // The rewritten product evaluates to inf * 0 = NaN for a == 0 and to
    // 0 * inf = NaN for a == +inf, whereas the original fdiv yields inf and 0
    // respectively. The rewrite is therefore only sound when infinities may be
    // assumed absent from X's operands and result.
    return X->hasNoInfs();
  };
  if (!XConstraintsSatisfied())
    return false;

  // Check the constraints on instructions in R1.
  auto R1ConstraintsSatisfied = [BBr1](Instruction *I) {
    // When you have multiple instructions residing in R1 and R2 respectively,
    // it's difficult to generate combinations of (R1,R2) and then check if we
    // have the required pattern. So, for now, just be conservative.
    if (I->getParent() != BBr1)
      return false;
    if (!I->hasNUsesOrMore(1))
      return false;
    // The optimization tries to convert
    // R1 = div * div      where, div = 1/sqrt(a)
    // to
    // R1 = 1/a
    // Now, this simplification does not work because sqrt(a)=NaN when a<0
    if (!I->hasNoNaNs())
      return false;
    // sqrt(-0.0) = -0.0, and doing this simplification would change the sign
    // of the result.
    return I->hasNoSignedZeros() && I->hasAllowReassoc();
  };
  if (!all_of(R1, R1ConstraintsSatisfied))
    return false;

  // Check the constraints on instructions in R2.
  auto R2ConstraintsSatisfied = [BBr2](Instruction *I) {
    // When you have multiple instructions residing in R1 and R2 respectively,
    // it's difficult to generate combination of (R1,R2) and then check if we
    // have the required pattern. So, for now, just be conservative.
    if (I->getParent() != BBr2)
      return false;
    if (!I->hasNUsesOrMore(1))
      return false;
    // This simplification changes
    // R2 = a/sqrt(a)
    // to
    // R2 = sqrt(a)
    // Now, sqrt(-0.0) = -0.0 and doing this simplification would produce -0.0
    // instead of NaN.
    return I->hasNoSignedZeros() && I->hasAllowReassoc();
  };
  if (!all_of(R2, R2ConstraintsSatisfied))
    return false;

  // Check the constraints on X, R1 and R2 combined.
  // fdiv instruction and one of the multiplications must reside in the same
  // block. If not, the optimized code may execute more ops than before and
  // this may hamper the performance.
  return (BBx == BBr1 || BBx == BBr2);
}
695+
696+
/// Collect the instructions that take part in the FSQRT/FDIV -> FMUL pattern
/// rooted at \p Div.
///
/// Nothing is collected unless \p Div is `1.0 / sqrt(a)` or `-1.0 / sqrt(a)`.
/// Otherwise:
///   - every user of \p Div of the form `Div * Div` is appended to \p R1;
///   - every user of the sqrt call of the form `a / sqrt(a)` is appended to
///     \p R2.
/// Each instruction is appended at most once.
///
/// \param Div the candidate fdiv instruction.
/// \param R1  output list of `x * x` multiplications.
/// \param R2  output list of `a / sqrt(a)` divisions.
static void getFSqrtDivOptPattern(Instruction *Div,
                                  SmallVectorImpl<Instruction *> &R1,
                                  SmallVectorImpl<Instruction *> &R2) {
  Value *A;
  if (!match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) &&
      !match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A)))))
    return;

  for (User *U : Div->users()) {
    auto *I = dyn_cast<Instruction>(U);
    // users() visits a user once per use, and `Div * Div` uses Div in both
    // operands; skip instructions that were already recorded.
    if (!I || is_contained(R1, I))
      continue;
    if (match(I, m_FMul(m_Specific(Div), m_Specific(Div))))
      R1.push_back(I);
  }

  // The successful match above guarantees that operand 1 is the sqrt call.
  auto *CI = cast<CallInst>(Div->getOperand(1));
  for (User *U : CI->users()) {
    auto *I = dyn_cast<Instruction>(U);
    // Never hand a null pointer to the pattern matcher.
    if (!I || is_contained(R2, I))
      continue;
    if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A)))))
      R2.push_back(I);
  }
}
722+
629723
Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
630724
Value *Op0 = I.getOperand(0);
631725
Value *Op1 = I.getOperand(1);
@@ -1796,6 +1890,35 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I,
17961890
return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I);
17971891
}
17981892

1893+
/// Emit the replacement sequence for the FSQRT/FDIV -> FMUL transform and
/// redirect the users of the old r1/r2 instructions.
///
/// \param CI     the sqrt call, reused as tmp2.
/// \param X      the original `1.0 / sqrt(a)` (or `-1.0 / sqrt(a)`) fdiv; new
///               instructions are inserted before it.
/// \param R1     the `x * x` multiplications; their uses are replaced by
///               tmp1 = 1.0 / a. Must be non-empty.
/// \param R2     the `a / sqrt(a)` divisions; their uses are replaced by CI.
/// \param SqrtOp the operand `a` of the sqrt call.
/// \param B      the builder used to create the new instructions; its
///               insertion point is restored before returning.
/// \returns the value tmp3 that is meant to replace X. X itself is not
///          modified here.
static Value *convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X,
                                      const SmallVectorImpl<Instruction *> &R1,
                                      const SmallVectorImpl<Instruction *> &R2,
                                      Value *SqrtOp,
                                      InstCombiner::BuilderTy &B) {
  // Do not leak the insertion-point change to the caller.
  InstCombiner::BuilderTy::InsertPointGuard Guard(B);

  // 1. synthesize tmp1 = 1/a and replace uses of r1
  B.SetInsertPoint(X);
  Value *Tmp1 =
      B.CreateFDivFMF(ConstantFP::get(R1[0]->getType(), 1.0), SqrtOp, R1[0]);
  // tmp1 stands in for every instruction in R1, so it may only keep the
  // fast-math flags that all of them share. Keeping R1[0]'s flags alone could
  // hand a stronger guarantee (e.g. ninf) to users of the other
  // multiplications than they originally had.
  if (auto *NewDiv = dyn_cast<Instruction>(Tmp1))
    for (Instruction *I : R1)
      NewDiv->andIRFlags(I);
  for (Instruction *I : R1)
    I->replaceAllUsesWith(Tmp1);

  // 2. No need of synthesizing Tmp2 again. In this scenario, tmp2 = CI. Replace
  // uses of r2 with tmp2
  for (Instruction *I : R2)
    I->replaceAllUsesWith(CI);

  // 3. synthesize tmp3 = tmp1 * tmp2 . Replace uses of 'x' with tmp3
  // If x = -1/sqrt(a) initially, then Tmp3 = -(Tmp1*tmp2). The intermediate
  // product is created without X's fast-math flags; only the negation carries
  // them.
  if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) {
    Value *Mul = B.CreateFMul(Tmp1, CI);
    return B.CreateFNegFMF(Mul, X);
  }
  return B.CreateFMulFMF(Tmp1, CI, X);
}
1921+
17991922
Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
18001923
Module *M = I.getModule();
18011924

@@ -1820,6 +1943,26 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
18201943
return R;
18211944

18221945
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
1946+
1947+
// Convert
1948+
// x = 1.0/sqrt(a)
1949+
// r1 = x * x;
1950+
// r2 = a/sqrt(a);
1951+
//
1952+
// TO
1953+
//
1954+
// r1 = 1/a
1955+
// r2 = sqrt(a)
1956+
// x = r1 * r2
1957+
SmallVector<Instruction *, 2> R1, R2;
1958+
getFSqrtDivOptPattern(&I, R1, R2);
1959+
if (!R1.empty() && !R2.empty() && isFSqrtDivToFMulLegal(&I, R1, R2)) {
1960+
CallInst *CI = cast<CallInst>(I.getOperand(1));
1961+
Value *SqrtOp = CI->getArgOperand(0);
1962+
if (Value *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, SqrtOp, Builder))
1963+
return replaceInstUsesWith(I, D);
1964+
}
1965+
18231966
if (isa<Constant>(Op0))
18241967
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
18251968
if (Instruction *R = FoldOpIntoSelect(I, SI))

0 commit comments

Comments
 (0)