Skip to content

Commit 5be756a

Browse files
committed
Review changes
Use IRBuilder functions for creating Ldexp, FMA intrinsic call Use different condition for controlling frem expansion Update test Fixup after merge NaN handling fixes Update tests
1 parent 1812c7f commit 5be756a

File tree

6 files changed

+1048
-1084
lines changed

6 files changed

+1048
-1084
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5672,10 +5672,6 @@ class TargetLowering : public TargetLoweringBase {
56725672
LoadSDNode *OriginalLoad,
56735673
SelectionDAG &DAG) const;
56745674

5675-
/// Indicates whether the FRem instruction should be expanded before
5676-
/// ISel in the LLVM IR.
5677-
virtual bool shouldExpandFRemInIR() const { return false; };
5678-
56795675
private:
56805676
SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
56815677
const SDLoc &DL, DAGCombinerInfo &DCI) const;

llvm/lib/CodeGen/ExpandFp.cpp

Lines changed: 70 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,22 @@
1717
#include "llvm/CodeGen/ExpandFp.h"
1818
#include "llvm/ADT/SmallVector.h"
1919
#include "llvm/Analysis/GlobalsModRef.h"
20+
#include "llvm/Analysis/SimplifyQuery.h"
21+
#include "llvm/Analysis/ValueTracking.h"
22+
#include "llvm/CodeGen/ISDOpcodes.h"
2023
#include "llvm/CodeGen/Passes.h"
2124
#include "llvm/CodeGen/TargetLowering.h"
2225
#include "llvm/CodeGen/TargetPassConfig.h"
2326
#include "llvm/CodeGen/TargetSubtargetInfo.h"
2427
#include "llvm/IR/IRBuilder.h"
2528
#include "llvm/IR/InstIterator.h"
2629
#include "llvm/IR/PassManager.h"
30+
#include "llvm/IR/Module.h"
31+
#include "llvm/IR/RuntimeLibcalls.h"
2732
#include "llvm/InitializePasses.h"
2833
#include "llvm/Pass.h"
2934
#include "llvm/Support/CommandLine.h"
35+
#include "llvm/Support/ErrorHandling.h"
3036
#include "llvm/Target/TargetMachine.h"
3137
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
3238

@@ -89,7 +95,7 @@ class FRemExpander {
8995
/// must match the type for which the class instance has been
9096
/// created. The code will be generated at the insertion point of \p
9197
/// B and the insertion point will be reset at exit.
92-
Value *buildFRem(Value *X, Value *Y) const;
98+
Value *buildFRem(Value *X, Value *Y, SimplifyQuery &SQ) const;
9399

94100
private:
95101
FRemExpander(IRBuilder<> &B, Type *FremTy, short Bits, unsigned long Signbit,
@@ -98,11 +104,6 @@ class FRemExpander {
98104
ExTy(B.getInt32Ty()), Bits(ConstantInt::get(ExTy, Bits)),
99105
One(ConstantInt::get(ExTy, 1)), Signbit(Signbit) {};
100106

101-
Value *createLdexp(Value *Base, Value *Exp, const Twine &Name) const {
102-
return B.CreateIntrinsic(Intrinsic::ldexp, {ComputeFpTy, B.getInt32Ty()},
103-
{Base, Exp}, {}, Name);
104-
}
105-
106107
Value *createRcp(Value *V, const Twine &Name) const {
107108
return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name);
108109
}
@@ -118,8 +119,7 @@ class FRemExpander {
118119
// ax = clt ? axp : ax;
119120
Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv),
120121
{}, "q");
121-
Value *AxUpdate = B.CreateIntrinsic(Intrinsic::fma, {ComputeFpTy},
122-
{B.CreateFNeg(Q), Ay, Ax}, {}, "ax");
122+
Value *AxUpdate = B.CreateFMA(B.CreateFNeg(Q), Ay, Ax, {}, "ax");
123123
Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate,
124124
ConstantFP::get(ComputeFpTy, 0.0), "clt");
125125
Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp");
@@ -145,7 +145,7 @@ class FRemExpander {
145145
Value *Exp = B.CreateExtractValue(Frexp, {1});
146146

147147
Exp = B.CreateSub(Exp, One, ExName);
148-
Value *Pow = createLdexp(Mant, NewExp, PowName);
148+
Value *Pow = B.CreateLdexp(Mant, NewExp, {}, PowName);
149149

150150
return {Pow, Exp};
151151
}
@@ -194,7 +194,7 @@ class FRemExpander {
194194
AxPhi->addIncoming(Ax, PreheaderBB);
195195

196196
Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv);
197-
AxPhiUpdate = createLdexp(AxPhiUpdate, Bits, "ax_update");
197+
AxPhiUpdate = B.CreateLdexp(AxPhiUpdate, Bits, {}, "ax_update");
198198
AxPhi->addIncoming(AxPhiUpdate, LoopBB);
199199
NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB);
200200

@@ -212,14 +212,14 @@ class FRemExpander {
212212
NbExitPhi->addIncoming(NbIv, LoopBB);
213213
NbExitPhi->addIncoming(Nb, PreheaderBB);
214214

215-
Value *AxFinal = createLdexp(
216-
AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), "ax");
215+
Value *AxFinal = B.CreateLdexp(
216+
AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), {}, "ax");
217217
AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
218218

219219
// Build:
220220
// ax = BUILTIN_FLDEXP_ComputeFpTy(ax, ey);
221221
// ret = AS_FLOAT((AS_INT(x) & SIGNBIT_SP32) ^ AS_INT(ax));
222-
AxFinal = createLdexp(AxFinal, Ey, "ax");
222+
AxFinal = B.CreateLdexp(AxFinal, Ey, {}, "ax");
223223

224224
Value *XAsInt = B.CreateBitCast(X, IntTy, "x_as_int");
225225
if (ComputeFpTy != X->getType())
@@ -249,28 +249,32 @@ class FRemExpander {
249249
RetPhi->addIncoming(Ret, B.GetInsertBlock());
250250
}
251251

252-
/// Adjust the result of the main computation from the FRem expansion
253-
/// if NaNs or infinite values are possible.
254-
Value *buildNanAndInfHandling(Value *Ret, Value *X, Value *Y) const {
252+
/// Return a value that is NaN if one of the corner cases concerning
253+
/// the inputs \p X and \p Y is detected, and \p Ret otherwise.
254+
Value *handleInputCornerCases(Value *Ret, Value *X,
255+
Value *Y, SimplifyQuery &SQ) const {
255256
// Build:
256257
// ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
257258
// bool c = !BUILTIN_ISNAN_ComputeFpTy(y) &&
258259
// BUILTIN_ISFINITE_ComputeFpTy(x);
259260
// ret = c ? ret : QNAN_ComputeFpTy;
260-
// TODO Handle NaN and infinity fast math flags separately here?
261261
Value *Nan = ConstantFP::getQNaN(FremTy);
262-
263-
Ret = B.CreateSelect(B.createIsFPClass(Y, FPClassTest::fcZero), Nan, Ret);
264-
Value *C = B.CreateLogicalAnd(
265-
B.CreateNot(B.createIsFPClass(Y, FPClassTest::fcNan)),
266-
B.createIsFPClass(X, FPClassTest::fcFinite));
262+
Ret = B.CreateSelect(B.CreateFCmpOEQ(Y, ConstantFP::get(FremTy, 0.0)), Nan,
263+
Ret);
264+
FPClassTest NotNan = FPClassTest::fcInf | FPClassTest::fcFinite;
265+
Value *YNotNan =
266+
isKnownNeverNaN(Y, 0, SQ) ? B.getTrue() : B.createIsFPClass(Y, NotNan);
267+
Value *XFinite = isKnownNeverInfinity(X, 0, SQ)
268+
? B.getTrue()
269+
: B.createIsFPClass(X, FPClassTest::fcFinite);
270+
Value *C = B.CreateLogicalAnd(YNotNan, XFinite);
267271
Ret = B.CreateSelect(C, Ret, Nan);
268272

269273
return Ret;
270274
}
271275
};
272276

273-
Value *FRemExpander::buildFRem(Value *X, Value *Y) const {
277+
Value *FRemExpander::buildFRem(Value *X, Value *Y, SimplifyQuery &SQ) const {
274278
assert(X->getType() == FremTy && Y->getType() == FremTy);
275279

276280
FastMathFlags FMF = B.getFastMathFlags();
@@ -293,8 +297,10 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y) const {
293297
PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret");
294298
Value *Ret = RetPhi;
295299

296-
if (!FMF.noNaNs() || !FMF.noInfs())
297-
Ret = buildNanAndInfHandling(Ret, X, Y);
300+
// We would return NaN in all corner cases handled here.
301+
// Hence, if NaNs are excluded, keep the result as it is.
302+
if (!FMF.noNaNs())
303+
Ret = handleInputCornerCases(Ret, X, Y, SQ);
298304

299305
Function *Fun = B.GetInsertBlock()->getParent();
300306
auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun);
@@ -352,7 +358,7 @@ static bool shouldSkipExpandFRem(BinaryOperator &I) {
352358
isConstOrConstSelectOp(I.getOperand(1));
353359
}
354360

355-
static bool expandFRem(BinaryOperator &I) {
361+
static bool expandFRem(BinaryOperator &I, SimplifyQuery &SQ) {
356362
LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
357363
if (shouldSkipExpandFRem(I)) {
358364
LLVM_DEBUG(
@@ -384,7 +390,7 @@ static bool expandFRem(BinaryOperator &I) {
384390

385391
Value *Ret;
386392
if (ReturnTy->isFloatingPointTy())
387-
Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1));
393+
Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1), SQ);
388394
else {
389395
auto *VecTy = cast<FixedVectorType>(ReturnTy);
390396

@@ -398,7 +404,7 @@ static bool expandFRem(BinaryOperator &I) {
398404
for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
399405
Value *Num = B.CreateExtractElement(Nums, I);
400406
Value *Denum = B.CreateExtractElement(Denums, I);
401-
Value *Rem = Expander->buildFRem(Num, Denum);
407+
Value *Rem = Expander->buildFRem(Num, Denum, SQ);
402408
Ret = B.CreateInsertElement(Ret, Rem, I);
403409
}
404410
}
@@ -963,6 +969,36 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
963969
I->eraseFromParent();
964970
}
965971

972+
// This covers all floating point types; more than we need here.
973+
// TODO Move somewhere else for general use?
974+
/// Return the Libcall for a frem instruction of
975+
/// type \p Ty.
976+
static RTLIB::Libcall fremToLibcall(Type *Ty) {
977+
assert(Ty->isFloatingPointTy());
978+
if (Ty->isFloatTy() || Ty->is16bitFPTy())
979+
return RTLIB::REM_F32;
980+
if (Ty->isDoubleTy())
981+
return RTLIB::REM_F64;
982+
if (Ty->isFP128Ty())
983+
return RTLIB::REM_F128;
984+
if (Ty->isX86_FP80Ty())
985+
return RTLIB::REM_F80;
986+
if (Ty->isPPC_FP128Ty())
987+
return RTLIB::REM_PPCF128;
988+
989+
llvm_unreachable("Unknown floating point type");
990+
}
991+
992+
/* Return true if, according to \p LibInfo, the target either directly
993+
supports the frem instruction for the \p Ty, has a custom lowering,
994+
or uses a libcall. */
995+
static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
996+
if (!TLI.isOperationExpand(ISD::FREM, EVT::getEVT(Ty)))
997+
return true;
998+
999+
return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
1000+
}
1001+
9661002
static bool runImpl(Function &F, const TargetLowering &TLI) {
9671003
SmallVector<Instruction *, 4> Replace;
9681004
SmallVector<Instruction *, 4> ReplaceVector;
@@ -979,7 +1015,7 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
9791015
for (auto &I : instructions(F)) {
9801016
switch (I.getOpcode()) {
9811017
case Instruction::FRem:
982-
if (TLI.shouldExpandFRemInIR()) {
1018+
if (!targetSupportsFrem(TLI, I.getType())) {
9831019
Replace.push_back(&I);
9841020
Modified = true;
9851021
}
@@ -1034,10 +1070,11 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
10341070

10351071
while (!Replace.empty()) {
10361072
Instruction *I = Replace.pop_back_val();
1037-
if (I->getOpcode() == Instruction::FRem)
1038-
expandFRem(cast<BinaryOperator>(*I));
1039-
else if (I->getOpcode() == Instruction::FPToUI ||
1040-
I->getOpcode() == Instruction::FPToSI) {
1073+
if (I->getOpcode() == Instruction::FRem) {
1074+
auto SQ = SimplifyQuery{I->getModule()->getDataLayout(), I};
1075+
expandFRem(cast<BinaryOperator>(*I), SQ);
1076+
} else if (I->getOpcode() == Instruction::FPToUI ||
1077+
I->getOpcode() == Instruction::FPToSI) {
10411078
expandFPToI(I);
10421079
} else {
10431080
expandIToFP(I);

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
411411
setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
412412
Expand);
413413

414-
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
414+
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
415415

416416
if (Subtarget->has16BitInsts())
417417
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
@@ -1424,7 +1424,6 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
14241424
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
14251425
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
14261426
case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1427-
case ISD::FREM: return LowerFREM(Op, DAG);
14281427
case ISD::FCEIL: return LowerFCEIL(Op, DAG);
14291428
case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
14301429
case ISD::FRINT: return LowerFRINT(Op, DAG);
@@ -2393,21 +2392,6 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
23932392
return DAG.getMergeValues(Res, DL);
23942393
}
23952394

2396-
// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2397-
SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2398-
SDLoc SL(Op);
2399-
EVT VT = Op.getValueType();
2400-
auto Flags = Op->getFlags();
2401-
SDValue X = Op.getOperand(0);
2402-
SDValue Y = Op.getOperand(1);
2403-
2404-
SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2405-
SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2406-
SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2407-
// TODO: For f32 use FMAD instead if !hasFastFMA32?
2408-
return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2409-
}
2410-
24112395
SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
24122396
SDLoc SL(Op);
24132397
SDValue Src = Op.getOperand(0);

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,6 @@ class AMDGPUTargetLowering : public TargetLowering {
387387
MVT getFenceOperandTy(const DataLayout &DL) const override {
388388
return MVT::i32;
389389
}
390-
bool shouldExpandFRemInIR() const override { return true; };
391390
};
392391

393392
namespace AMDGPUISD {

0 commit comments

Comments
 (0)