Skip to content

Commit 58d73b1

Browse files
committed
[ValueTracking][X86] Compute KnownBits for phadd/phsub
Add KnownBits computations to ValueTracking and X86 DAG lowering. These instructions add/subtract adjacent vector elements in their operands. Example: phadd [X1, X2] [Y1, Y2] = [X1 + X2, Y1 + Y2]. This means that, in this example, we can compute the KnownBits of the operation by computing the KnownBits of X1 + X2 and of Y1 + Y2 and then intersecting the two results, since each result element is one of these two sums. This approach also generalizes to all x86 vector types. There are also the operations phadd.sw and phsub.sw, which perform saturating addition/subtraction. Use sadd_sat and ssub_sat to compute the KnownBits of these operations. Also adjust the existing test case pr53247.ll because it can be transformed to a constant using the new KnownBits computation. Fixes #82516.
1 parent 3caccd8 commit 58d73b1

File tree

8 files changed

+266
-130
lines changed

8 files changed

+266
-130
lines changed

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,20 @@ void processShuffleMasks(
246246
function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
247247
function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);
248248

249+
/// Compute the demanded elements mask of horizontal binary operations. A
250+
/// horizontal operation combines two adjacent elements in a vector operand.
251+
/// This function returns a mask for the elements that correspond to the first
252+
/// operand of this horizontal combination. For example, for two vectors
253+
/// [X1, X2, X3, X4] and [Y1, Y2, Y3, Y4], the resulting mask can include the
254+
/// elements X1, X3, Y1, and Y3. To get the other operands, simply shift the
255+
/// result of this function to the left by 1.
256+
///
257+
/// \param DemandedEltsOp the demanded elements mask for the operation
258+
/// \param DemandedEltsLHS the demanded elements mask for the left operand
259+
/// \param DemandedEltsRHS the demanded elements mask for the right operand
260+
void getHorizontalDemandedElts(const APInt &DemandedEltsOp,
261+
APInt &DemandedEltsLHS, APInt &DemandedEltsRHS);
262+
249263
/// Compute a map of integer instructions to their minimum legal type
250264
/// size.
251265
///

llvm/lib/Analysis/ValueTracking.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -950,6 +950,41 @@ getKnownBitsFromAndXorOr(const Operator *I, const APInt &DemandedElts,
950950
return KnownOut;
951951
}
952952

953+
/// Compute the known bits of a horizontal binary operation (e.g. the x86
/// phadd/phsub intrinsics), which combines the two adjacent elements of each
/// pair within a single operand:
///   op [X1, X2], [Y1, Y2] --> [f(X1, X2), f(Y1, Y2)]
/// Every result element is f applied to a pair drawn from exactly one operand,
/// so the result is the intersection of f over the LHS pairs and f over the
/// RHS pairs.
///
/// \param I            the horizontal operation (a two-operand Operator)
/// \param DemandedElts demanded elements mask of the result
/// \param KnownBitsFunc models f on KnownBits (e.g. add, sub, sadd_sat)
static KnownBits computeKnownBitsForHorizontalOperation(
    const Operator *I, const APInt &DemandedElts, unsigned Depth,
    const SimplifyQuery &Q,
    const std::function<KnownBits(const KnownBits &, const KnownBits &)>
        KnownBitsFunc) {
  APInt DemandedEltsLHS, DemandedEltsRHS;
  getHorizontalDemandedElts(DemandedElts, DemandedEltsLHS, DemandedEltsRHS);

  // Compute the known bits of the first (index 0) and second (index 1)
  // elements of the demanded pairs within one operand. \p DemandedEltsOp
  // initially selects the first element of each pair; shifting it left by one
  // selects the second element.
  const auto ComputeForSingleOp = [&](const Value *Op, APInt DemandedEltsOp) {
    std::array<KnownBits, 2> Known;
    for (unsigned Index = 0; Index < Known.size(); ++Index) {
      if (!DemandedEltsOp.isZero()) {
        Known[Index] = computeKnownBits(Op, DemandedEltsOp, Depth + 1, Q);
      } else {
        // Nothing demanded from this operand: use an all-zero (conflict-free)
        // value so it does not weaken the final intersection.
        Known[Index] = KnownBits(I->getType()->getScalarSizeInBits());
        Known[Index].setAllZero();
      }
      DemandedEltsOp <<= 1;
    }
    return Known;
  };

  const std::array<KnownBits, 2> KnownLHS =
      ComputeForSingleOp(I->getOperand(0), DemandedEltsLHS);
  const std::array<KnownBits, 2> KnownRHS =
      ComputeForSingleOp(I->getOperand(1), DemandedEltsRHS);

  return KnownBitsFunc(KnownLHS[0], KnownLHS[1])
      .intersectWith(KnownBitsFunc(KnownRHS[0], KnownRHS[1]));
}
987+
953988
// Public so this can be used in `SimplifyDemandedUseBits`.
954989
KnownBits llvm::analyzeKnownBitsFromAndXorOr(const Operator *I,
955990
const KnownBits &KnownLHS,
@@ -1725,6 +1760,56 @@ static void computeKnownBitsFromOperator(const Operator *I,
17251760
case Intrinsic::x86_sse42_crc32_64_64:
17261761
Known.Zero.setBitsFrom(32);
17271762
break;
1763+
case Intrinsic::x86_ssse3_phadd_d:
1764+
case Intrinsic::x86_ssse3_phadd_w:
1765+
case Intrinsic::x86_ssse3_phadd_d_128:
1766+
case Intrinsic::x86_ssse3_phadd_w_128:
1767+
case Intrinsic::x86_avx2_phadd_d:
1768+
case Intrinsic::x86_avx2_phadd_w: {
1769+
Known = computeKnownBitsForHorizontalOperation(
1770+
I, DemandedElts, Depth, Q,
1771+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
1772+
return KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
1773+
/*NUW=*/false, KnownLHS,
1774+
KnownRHS);
1775+
});
1776+
break;
1777+
}
1778+
case Intrinsic::x86_ssse3_phadd_sw:
1779+
case Intrinsic::x86_ssse3_phadd_sw_128:
1780+
case Intrinsic::x86_avx2_phadd_sw: {
1781+
Known = computeKnownBitsForHorizontalOperation(
1782+
I, DemandedElts, Depth, Q,
1783+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
1784+
return KnownBits::sadd_sat(KnownLHS, KnownRHS);
1785+
});
1786+
break;
1787+
}
1788+
case Intrinsic::x86_ssse3_phsub_d:
1789+
case Intrinsic::x86_ssse3_phsub_w:
1790+
case Intrinsic::x86_ssse3_phsub_d_128:
1791+
case Intrinsic::x86_ssse3_phsub_w_128:
1792+
case Intrinsic::x86_avx2_phsub_d:
1793+
case Intrinsic::x86_avx2_phsub_w: {
1794+
Known = computeKnownBitsForHorizontalOperation(
1795+
I, DemandedElts, Depth, Q,
1796+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
1797+
return KnownBits::computeForAddSub(/*Add=*/false, /*NSW=*/false,
1798+
/*NUW=*/false, KnownLHS,
1799+
KnownRHS);
1800+
});
1801+
break;
1802+
}
1803+
case Intrinsic::x86_ssse3_phsub_sw:
1804+
case Intrinsic::x86_ssse3_phsub_sw_128:
1805+
case Intrinsic::x86_avx2_phsub_sw: {
1806+
Known = computeKnownBitsForHorizontalOperation(
1807+
I, DemandedElts, Depth, Q,
1808+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
1809+
return KnownBits::ssub_sat(KnownLHS, KnownRHS);
1810+
});
1811+
break;
1812+
}
17281813
case Intrinsic::riscv_vsetvli:
17291814
case Intrinsic::riscv_vsetvlimax: {
17301815
bool HasAVL = II->getIntrinsicID() == Intrinsic::riscv_vsetvli;

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,26 @@ void llvm::processShuffleMasks(
541541
}
542542
}
543543

544+
void llvm::getHorizontalDemandedElts(const APInt &DemandedEltsOp,
545+
APInt &DemandedEltsLHS,
546+
APInt &DemandedEltsRHS) {
547+
DemandedEltsLHS = DemandedEltsRHS =
548+
APInt::getZero(DemandedEltsOp.getBitWidth());
549+
550+
unsigned Index = 0;
551+
const auto HalfBitWidth = DemandedEltsOp.getBitWidth() / 2;
552+
for (; Index < HalfBitWidth; ++Index) {
553+
if (DemandedEltsOp[Index]) {
554+
DemandedEltsLHS.setBit(2 * Index);
555+
}
556+
}
557+
for (; Index < DemandedEltsOp.getBitWidth(); ++Index) {
558+
if (DemandedEltsOp[Index]) {
559+
DemandedEltsRHS.setBit(2 * (Index - HalfBitWidth));
560+
}
561+
}
562+
}
563+
544564
MapVector<Instruction *, uint64_t>
545565
llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
546566
const TargetTransformInfo *TTI) {

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36953,6 +36953,41 @@ static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
3695336953
Known = Known.zext(64);
3695436954
}
3695536955

36956+
/// Compute the known bits of a horizontal binary operation (X86ISD::HADD/HSUB
/// nodes and the phadd/phsub intrinsics), which combines the two adjacent
/// elements of each pair within a single operand:
///   op [X1, X2], [Y1, Y2] --> [f(X1, X2), f(Y1, Y2)]
/// Every result element is f applied to a pair drawn from exactly one operand,
/// so the result is the intersection of f over the LHS pairs and f over the
/// RHS pairs.
///
/// \param OpIndexStart index of the LHS operand on \p Op; the RHS follows at
///        OpIndexStart + 1. This is 0 for HADD/HSUB nodes and 1 for
///        INTRINSIC_WO_CHAIN (whose operand 0 is the intrinsic ID).
/// \param KnownBitsFunc models f on KnownBits (e.g. add, sub, sadd_sat)
static KnownBits computeKnownBitsForHorizontalOperation(
    const SDValue Op, const APInt &DemandedElts, unsigned Depth,
    unsigned OpIndexStart, const SelectionDAG &DAG,
    const std::function<KnownBits(const KnownBits &, const KnownBits &)>
        KnownBitsFunc) {
  APInt DemandedEltsLHS, DemandedEltsRHS;
  getHorizontalDemandedElts(DemandedElts, DemandedEltsLHS, DemandedEltsRHS);

  // Compute the known bits of the first (index 0) and second (index 1)
  // elements of the demanded pairs within one operand. \p DemandedEltsOp
  // initially selects the first element of each pair; shifting it left by one
  // selects the second element.
  const auto ComputeForSingleOp = [&](unsigned OpIndex, APInt DemandedEltsOp) {
    std::array<KnownBits, 2> Known;
    for (unsigned Index = 0; Index < Known.size(); ++Index) {
      if (!DemandedEltsOp.isZero()) {
        Known[Index] = DAG.computeKnownBits(Op.getOperand(OpIndex),
                                            DemandedEltsOp, Depth + 1);
      } else {
        // Nothing demanded from this operand: use an all-zero (conflict-free)
        // value so it does not weaken the final intersection.
        Known[Index] = KnownBits(Op.getScalarValueSizeInBits());
        Known[Index].setAllZero();
      }
      DemandedEltsOp <<= 1;
    }
    return Known;
  };

  const std::array<KnownBits, 2> KnownLHS =
      ComputeForSingleOp(OpIndexStart, DemandedEltsLHS);
  const std::array<KnownBits, 2> KnownRHS =
      ComputeForSingleOp(OpIndexStart + 1, DemandedEltsRHS);

  return KnownBitsFunc(KnownLHS[0], KnownLHS[1])
      .intersectWith(KnownBitsFunc(KnownRHS[0], KnownRHS[1]));
}
36990+
3695636991
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3695736992
KnownBits &Known,
3695836993
const APInt &DemandedElts,
@@ -37262,6 +37297,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3726237297
}
3726337298
break;
3726437299
}
37300+
case X86ISD::HADD:
37301+
case X86ISD::HSUB: {
37302+
Known = computeKnownBitsForHorizontalOperation(
37303+
Op, DemandedElts, Depth, /*OpIndexStart=*/0, DAG,
37304+
[Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
37305+
return KnownBits::computeForAddSub(
37306+
/*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
37307+
KnownLHS, KnownRHS);
37308+
});
37309+
break;
37310+
}
3726537311
case ISD::INTRINSIC_WO_CHAIN: {
3726637312
switch (Op->getConstantOperandVal(0)) {
3726737313
case Intrinsic::x86_sse2_psad_bw:
@@ -37276,6 +37322,55 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3727637322
computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
3727737323
break;
3727837324
}
37325+
case Intrinsic::x86_ssse3_phadd_d:
37326+
case Intrinsic::x86_ssse3_phadd_w:
37327+
case Intrinsic::x86_ssse3_phadd_d_128:
37328+
case Intrinsic::x86_ssse3_phadd_w_128:
37329+
case Intrinsic::x86_avx2_phadd_d:
37330+
case Intrinsic::x86_avx2_phadd_w: {
37331+
Known = computeKnownBitsForHorizontalOperation(
37332+
Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
37333+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
37334+
return KnownBits::computeForAddSub(
37335+
/*Add=*/true, /*NSW=*/false, /*NUW=*/false, KnownLHS, KnownRHS);
37336+
});
37337+
break;
37338+
}
37339+
case Intrinsic::x86_ssse3_phadd_sw:
37340+
case Intrinsic::x86_ssse3_phadd_sw_128:
37341+
case Intrinsic::x86_avx2_phadd_sw: {
37342+
Known = computeKnownBitsForHorizontalOperation(
37343+
Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
37344+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
37345+
return KnownBits::sadd_sat(KnownLHS, KnownRHS);
37346+
});
37347+
break;
37348+
}
37349+
case Intrinsic::x86_ssse3_phsub_d:
37350+
case Intrinsic::x86_ssse3_phsub_w:
37351+
case Intrinsic::x86_ssse3_phsub_d_128:
37352+
case Intrinsic::x86_ssse3_phsub_w_128:
37353+
case Intrinsic::x86_avx2_phsub_d:
37354+
case Intrinsic::x86_avx2_phsub_w: {
37355+
Known = computeKnownBitsForHorizontalOperation(
37356+
Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
37357+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
37358+
return KnownBits::computeForAddSub(/*Add=*/false, /*NSW=*/false,
37359+
/*NUW=*/false, KnownLHS,
37360+
KnownRHS);
37361+
});
37362+
break;
37363+
}
37364+
case Intrinsic::x86_ssse3_phsub_sw:
37365+
case Intrinsic::x86_ssse3_phsub_sw_128:
37366+
case Intrinsic::x86_avx2_phsub_sw: {
37367+
Known = computeKnownBitsForHorizontalOperation(
37368+
Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
37369+
[](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
37370+
return KnownBits::ssub_sat(KnownLHS, KnownRHS);
37371+
});
37372+
break;
37373+
}
3727937374
}
3728037375
break;
3728137376
}

0 commit comments

Comments
 (0)