Skip to content

Commit 06ab30b

Browse files
authored
[AMDGPU] Constant folding of llvm.amdgcn.trig.preop (#98562)
If the parameters (the input and the segment select) passed to the amdgcn.trig.preop intrinsic are compile-time constants, we pre-compute the output of amdgcn.trig.preop on the CPU and replace its uses with the computed constant. This work extends the patch https://reviews.llvm.org/D120150 to provide complete coverage. For the segment select, only src1[4:0] are used. A segment select is invalid if it selects a 53-bit segment beyond the [1200:0] range of the 2/PI table. 0 is returned when the segment select is not valid.
1 parent fe04aaf commit 06ab30b

File tree

2 files changed

+302
-61
lines changed

2 files changed

+302
-61
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,78 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
11021102

11031103
break;
11041104
}
1105+
case Intrinsic::amdgcn_trig_preop: {
1106+
// The intrinsic is declared with name mangling, but currently the
1107+
// instruction only exists for f64
1108+
if (!II.getType()->isDoubleTy())
1109+
break;
1110+
1111+
Value *Src = II.getArgOperand(0);
1112+
Value *Segment = II.getArgOperand(1);
1113+
if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1114+
return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1115+
1116+
if (isa<UndefValue>(Src)) {
1117+
auto *QNaN = ConstantFP::get(
1118+
II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1119+
return IC.replaceInstUsesWith(II, QNaN);
1120+
}
1121+
1122+
const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1123+
if (!Csrc)
1124+
break;
1125+
1126+
if (II.isStrictFP())
1127+
break;
1128+
1129+
const APFloat &Fsrc = Csrc->getValueAPF();
1130+
if (Fsrc.isNaN()) {
1131+
auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1132+
return IC.replaceInstUsesWith(II, Quieted);
1133+
}
1134+
1135+
const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1136+
if (!Cseg)
1137+
break;
1138+
1139+
unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1140+
unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1141+
unsigned Shift = SegmentVal * 53;
1142+
if (Exponent > 1077)
1143+
Shift += Exponent - 1077;
1144+
1145+
// 2.0/PI table.
1146+
static const uint32_t TwoByPi[] = {
1147+
0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1148+
0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1149+
0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1150+
0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1151+
0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1152+
0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1153+
0x56033046};
1154+
1155+
// Return 0 for an out-of-bounds segment (hardware behavior).
1156+
unsigned Idx = Shift >> 5;
1157+
if (Idx + 2 >= std::size(TwoByPi)) {
1158+
APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1159+
return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1160+
}
1161+
1162+
unsigned BShift = Shift & 0x1f;
1163+
uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1164+
uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1165+
if (BShift)
1166+
Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1167+
Thi = Thi >> 11;
1168+
APFloat Result = APFloat((double)Thi);
1169+
1170+
int Scale = -53 - Shift;
1171+
if (Exponent >= 1968)
1172+
Scale += 128;
1173+
1174+
Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1175+
return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1176+
}
11051177
case Intrinsic::amdgcn_fmul_legacy: {
11061178
Value *Op0 = II.getArgOperand(0);
11071179
Value *Op1 = II.getArgOperand(1);

0 commit comments

Comments
 (0)