Skip to content

Commit 1faa479

Browse files
committed
AMDGPU: Handle unsafe exp.f32 with denormal handling
I somehow missed this path when adding the new expansions. This saves a lot of instructions when the afn (approximate functions) fast-math flag is combined with IEEE denormal handling for f32. https://reviews.llvm.org/D157867
1 parent d45022b commit 1faa479

File tree

3 files changed

+458
-942
lines changed

3 files changed

+458
-942
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2760,14 +2760,40 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
27602760
return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
27612761
}
27622762

2763-
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL,
2763+
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
27642764
SelectionDAG &DAG,
27652765
SDNodeFlags Flags) const {
2766-
// exp2(M_LOG2E_F * f);
2767-
EVT VT = Op.getValueType();
2768-
const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2769-
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags);
2770-
return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul,
2766+
EVT VT = X.getValueType();
2767+
const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2768+
2769+
if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2770+
// exp2(M_LOG2E_F * f);
2771+
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2772+
return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT,
2773+
Mul, Flags);
2774+
}
2775+
2776+
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2777+
2778+
SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2779+
SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2780+
2781+
SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2782+
2783+
SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2784+
2785+
SDValue AdjustedX =
2786+
DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2787+
2788+
SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2789+
2790+
SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2791+
2792+
SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2793+
SDValue AdjustedResult =
2794+
DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2795+
2796+
return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
27712797
Flags);
27722798
}
27732799

@@ -2800,7 +2826,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
28002826

28012827
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
28022828
// library behavior. Also, is known-not-daz source sufficient?
2803-
if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) {
2829+
if (allowApproxFunc(DAG, Flags)) {
28042830
assert(!IsExp10 && "todo exp10 support");
28052831
return lowerFEXPUnsafe(X, SL, DAG, Flags);
28062832
}

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3304,20 +3304,42 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
33043304
}
33053305

33063306
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3307-
Register Src,
3308-
unsigned Flags) const {
3307+
Register X, unsigned Flags) const {
33093308
LLT Ty = B.getMRI()->getType(Dst);
3310-
auto K = B.buildFConstant(Ty, numbers::log2e);
3311-
auto Mul = B.buildFMul(Ty, Src, K, Flags);
3309+
LLT F32 = LLT::scalar(32);
33123310

3313-
if (Ty == LLT::scalar(32)) {
3314-
B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3311+
if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3312+
auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3313+
auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3314+
3315+
if (Ty == F32) {
3316+
B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
33153317
.addUse(Mul.getReg(0))
33163318
.setMIFlags(Flags);
3317-
} else {
3318-
B.buildFExp2(Dst, Mul.getReg(0), Flags);
3319+
} else {
3320+
B.buildFExp2(Dst, Mul.getReg(0), Flags);
3321+
}
3322+
3323+
return true;
33193324
}
33203325

3326+
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3327+
auto NeedsScaling =
3328+
B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3329+
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3330+
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3331+
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3332+
3333+
auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3334+
auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3335+
3336+
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3337+
.addUse(ExpInput.getReg(0))
3338+
.setMIFlags(Flags);
3339+
3340+
auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3341+
auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3342+
B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
33213343
return true;
33223344
}
33233345

@@ -3358,7 +3380,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
33583380

33593381
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
33603382
// library behavior. Also, is known-not-daz source sufficient?
3361-
if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) {
3383+
if (allowApproxFunc(MF, Flags)) {
33623384
legalizeFExpUnsafe(B, Dst, X, Flags);
33633385
MI.eraseFromParent();
33643386
return true;

0 commit comments

Comments
 (0)