Skip to content

Commit a0ef575

Browse files
authored
Merge pull request #9858 from fhahn/pick-dotprod
Pick Vector Dotproduct improvements.
2 parents dbb0934 + 66ddce5 commit a0ef575

23 files changed

+7725
-43
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,12 @@ typedef TargetTransformInfo TTI;
213213
/// for IR-level transformations.
214214
class TargetTransformInfo {
215215
public:
216+
enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
217+
218+
/// Get the kind of extension that an instruction represents.
219+
static PartialReductionExtendKind
220+
getPartialReductionExtendKind(Instruction *I);
221+
216222
/// Construct a TTI object using a type implementing the \c Concept
217223
/// API below.
218224
///
@@ -1257,6 +1263,20 @@ class TargetTransformInfo {
12571263
/// \return if target want to issue a prefetch in address space \p AS.
12581264
bool shouldPrefetchAddressSpace(unsigned AS) const;
12591265

1266+
/// \return The cost of a partial reduction, which is a reduction from a
1267+
/// vector to another vector with fewer elements of larger size. They are
1268+
/// represented by @llvm.experimental.vector.partial.reduce.add, which
1269+
/// takes an accumulator and a binary operation operand that itself is fed by
1270+
/// two extends. An example of an operation that uses a partial reduction is a
1271+
/// dot product, which reduces two vectors to another of 4 times fewer and 4
1272+
/// times larger elements.
1273+
InstructionCost
1274+
getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
1275+
Type *AccumType, ElementCount VF,
1276+
PartialReductionExtendKind OpAExtend,
1277+
PartialReductionExtendKind OpBExtend,
1278+
std::optional<unsigned> BinOp = std::nullopt) const;
1279+
12601280
/// \return The maximum interleave factor that any transform should try to
12611281
/// perform for this target. This number depends on the level of parallelism
12621282
/// and the number of execution units in the CPU.
@@ -2034,6 +2054,20 @@ class TargetTransformInfo::Concept {
20342054
/// \return if target want to issue a prefetch in address space \p AS.
20352055
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
20362056

2057+
/// \return The cost of a partial reduction, which is a reduction from a
2058+
/// vector to another vector with fewer elements of larger size. They are
2059+
/// represented by @llvm.experimental.vector.partial.reduce.add, which
2060+
/// takes an accumulator and a binary operation operand that itself is fed by
2061+
/// two extends. An example of an operation that uses a partial reduction is a
2062+
/// dot product, which reduces two vectors to another of 4 times fewer and 4
2063+
/// times larger elements.
2064+
virtual InstructionCost
2065+
getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
2066+
Type *AccumType, ElementCount VF,
2067+
PartialReductionExtendKind OpAExtend,
2068+
PartialReductionExtendKind OpBExtend,
2069+
std::optional<unsigned> BinOp) const = 0;
2070+
20372071
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
20382072
virtual InstructionCost getArithmeticInstrCost(
20392073
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
@@ -2669,6 +2703,16 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
26692703
return Impl.shouldPrefetchAddressSpace(AS);
26702704
}
26712705

2706+
/// Forward the partial reduction cost query to \c Impl, passing every
/// argument through unchanged.
InstructionCost
getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
                        Type *AccumType, ElementCount VF,
                        PartialReductionExtendKind OpAExtend,
                        PartialReductionExtendKind OpBExtend,
                        std::optional<unsigned> BinOp = std::nullopt)
    const override {
  return Impl.getPartialReductionCost(Opcode, InputTypeA, InputTypeB, AccumType,
                                      VF, OpAExtend, OpBExtend, BinOp);
}
2715+
26722716
unsigned getMaxInterleaveFactor(ElementCount VF) override {
26732717
return Impl.getMaxInterleaveFactor(VF);
26742718
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,15 @@ class TargetTransformInfoImplBase {
543543
bool enableWritePrefetching() const { return false; }
544544
bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; }
545545

546+
/// Base-class fallback for the partial reduction cost query: all arguments
/// are ignored and an invalid cost is reported.
InstructionCost getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
    TTI::PartialReductionExtendKind OpBExtend,
    std::optional<unsigned> BinOp = std::nullopt) const {
  return InstructionCost::getInvalid();
}
554+
546555
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
547556

548557
InstructionCost getArithmeticInstrCost(

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,6 +1587,11 @@ class SelectionDAG {
15871587
/// the target's desired shift amount type.
15881588
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
15891589

1590+
/// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are
1591+
/// its operands and ReducedTy is the intrinsic's return type.
1592+
SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
1593+
SDValue Op2);
1594+
15901595
/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
15911596
SDValue expandVAArg(SDNode *Node);
15921597

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,13 @@ class TargetLoweringBase {
454454
return true;
455455
}
456456

457+
/// Return true if the @llvm.experimental.vector.partial.reduce.* intrinsic
458+
/// should be expanded using generic code in SelectionDAGBuilder.
459+
virtual bool
460+
shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const {
461+
return true;
462+
}
463+
457464
/// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
458465
/// using generic code in SelectionDAGBuilder.
459466
virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,15 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const {
829829
return TTIImpl->shouldPrefetchAddressSpace(AS);
830830
}
831831

832+
// Delegate the partial reduction cost query to the implementation held in
// TTIImpl; no argument is modified on the way through.
InstructionCost TargetTransformInfo::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, PartialReductionExtendKind OpAExtend,
    PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp) const {
  return TTIImpl->getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
                                          AccumType, VF, OpAExtend, OpBExtend,
                                          BinOp);
}
840+
832841
unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
833842
return TTIImpl->getMaxInterleaveFactor(VF);
834843
}
@@ -940,6 +949,15 @@ InstructionCost TargetTransformInfo::getShuffleCost(
940949
return Cost;
941950
}
942951

952+
// Classify \p I by the kind of extension it performs: a sign-extend maps to
// PR_SignExtend, a zero-extend to PR_ZeroExtend, and anything else to PR_None.
TargetTransformInfo::PartialReductionExtendKind
TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
  if (isa<SExtInst>(I))
    return PR_SignExtend;
  return isa<ZExtInst>(I) ? PR_ZeroExtend : PR_None;
}
960+
943961
TTI::CastContextHint
944962
TargetTransformInfo::getCastContextHint(const Instruction *I) {
945963
if (!I)

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
#include <cassert>
7575
#include <cstdint>
7676
#include <cstdlib>
77+
#include <deque>
7778
#include <limits>
7879
#include <optional>
7980
#include <set>
@@ -2411,6 +2412,35 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
24112412
return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
24122413
}
24132414

2415+
// Generic expansion of a partial add reduction: Op2 is cut into consecutive
// ReducedTy-sized subvectors, and those subvectors are summed together with
// the accumulator Op1 using ISD::ADD nodes. Returns the final sum.
SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
                                          SDValue Op2) {
  const EVT WideTy = Op2.getValueType();

  // Minimum element count of one piece, and how many pieces Op2 splits into.
  const unsigned NumReducedElts = ReducedTy.getVectorMinNumElements();
  const unsigned NumParts = WideTy.getVectorMinNumElements() / NumReducedElts;

  // Queue of values still to be summed, seeded with the accumulator and
  // followed by each extracted piece of Op2 in index order.
  std::deque<SDValue> Pending;
  Pending.push_back(Op1);
  for (unsigned Part = 0; Part != NumParts; ++Part) {
    SDValue Offset = getVectorIdxConstant(Part * NumReducedElts, DL);
    Pending.push_back(
        getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Op2, Offset}));
  }

  // Replace the two oldest entries with their sum until one value remains.
  while (Pending.size() > 1) {
    SDValue LHS = Pending.front();
    Pending.pop_front();
    SDValue RHS = Pending.front();
    Pending.pop_front();
    Pending.push_back(getNode(ISD::ADD, DL, ReducedTy, {LHS, RHS}));
  }

  assert(Pending.size() == 1 &&
         "There should only be one subvector after tree flattening");

  return Pending.front();
}
2443+
24142444
SDValue SelectionDAG::expandVAArg(SDNode *Node) {
24152445
SDLoc dl(Node);
24162446
const TargetLowering &TLI = getTargetLoweringInfo();

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 6 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8000,34 +8000,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
80008000
return;
80018001
}
80028002
case Intrinsic::experimental_vector_partial_reduce_add: {
8003-
SDValue OpNode = getValue(I.getOperand(1));
8004-
EVT ReducedTy = EVT::getEVT(I.getType());
8005-
EVT FullTy = OpNode.getValueType();
80068003

8007-
unsigned Stride = ReducedTy.getVectorMinNumElements();
8008-
unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
8009-
8010-
// Collect all of the subvectors
8011-
std::deque<SDValue> Subvectors;
8012-
Subvectors.push_back(getValue(I.getOperand(0)));
8013-
for (unsigned i = 0; i < ScaleFactor; i++) {
8014-
auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
8015-
Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
8016-
{OpNode, SourceIndex}));
8017-
}
8018-
8019-
// Flatten the subvector tree
8020-
while (Subvectors.size() > 1) {
8021-
Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
8022-
{Subvectors[0], Subvectors[1]}));
8023-
Subvectors.pop_front();
8024-
Subvectors.pop_front();
8004+
if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
8005+
visitTargetIntrinsic(I, Intrinsic);
8006+
return;
80258007
}
80268008

8027-
assert(Subvectors.size() == 1 &&
8028-
"There should only be one subvector after tree flattening");
8029-
8030-
setValue(&I, Subvectors[0]);
8009+
setValue(&I, DAG.getPartialReduceAdd(sdl, EVT::getEVT(I.getType()),
8010+
getValue(I.getOperand(0)),
8011+
getValue(I.getOperand(1))));
80318012
return;
80328013
}
80338014
case Intrinsic::experimental_cttz_elts: {

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1971,6 +1971,16 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
19711971
return false;
19721972
}
19731973

1974+
bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
1975+
const IntrinsicInst *I) const {
1976+
if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
1977+
return true;
1978+
1979+
EVT VT = EVT::getEVT(I->getType());
1980+
return VT != MVT::nxv4i32 && VT != MVT::nxv2i64 && VT != MVT::v4i32 &&
1981+
VT != MVT::v2i32;
1982+
}
1983+
19741984
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
19751985
if (!Subtarget->isSVEorStreamingSVEAvailable())
19761986
return true;
@@ -21250,6 +21260,64 @@ static SDValue tryCombineWhileLo(SDNode *N,
2125021260
return SDValue(N, 0);
2125121261
}
2125221262

21263+
/// Try to turn an experimental_vector_partial_reduce_add intrinsic node into a
/// single AArch64ISD::SDOT or AArch64ISD::UDOT node.
///
/// The pattern matched is
///   partial.reduce.add(Acc, mul(ext(A), ext(B)))
/// where both extends are the same kind (both sign- or both zero-extends), A
/// and B have the same type, and the (result, source) type pair is one of the
/// combinations listed at the end of the function.
///
/// \param N         The INTRINSIC_WO_CHAIN node for the partial reduction.
/// \param Subtarget Used to check that the required vector features exist.
/// \param DAG       The DAG in which the replacement node is created.
/// \return The dot product node, or an empty SDValue if the pattern or the
///         subtarget does not allow the lowering.
static SDValue tryLowerPartialReductionToDot(SDNode *N,
                                             const AArch64Subtarget *Subtarget,
                                             SelectionDAG &DAG) {

  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
         getIntrinsicID(N) ==
             Intrinsic::experimental_vector_partial_reduce_add &&
         "Expected a partial reduction node");

  EVT ReducedType = N->getValueType(0);

  // Scalable results need SVE (or streaming SVE); fixed-width results need
  // both NEON and the dot product extension.
  bool Scalable = ReducedType.isScalableVector();
  if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
    return SDValue();
  if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
    return SDValue();

  SDLoc DL(N);

  // Operand 1 is the accumulator (it has the reduced result type) and
  // operand 2 is the wide value being reduced into it.
  SDValue Acc = N->getOperand(1);
  SDValue MulOp = N->getOperand(2);
  if (MulOp->getOpcode() != ISD::MUL)
    return SDValue();

  // Both multiplicands must be extended the same way.
  SDValue ExtA = MulOp->getOperand(0);
  SDValue ExtB = MulOp->getOperand(1);
  bool IsSExt = ExtA->getOpcode() == ISD::SIGN_EXTEND;
  bool IsZExt = ExtA->getOpcode() == ISD::ZERO_EXTEND;
  if (ExtA->getOpcode() != ExtB->getOpcode() || (!IsSExt && !IsZExt))
    return SDValue();

  SDValue A = ExtA->getOperand(0);
  SDValue B = ExtB->getOperand(0);
  if (A.getValueType() != B.getValueType())
    return SDValue();

  // Exactly one of IsSExt / IsZExt holds here because of the guard above.
  unsigned Opcode = IsSExt ? AArch64ISD::SDOT : AArch64ISD::UDOT;

  EVT MulSrcType = A.getValueType();

  // Dot products operate on chunks of four elements so there must be four times
  // as many elements in the wide type
  if ((ReducedType == MVT::nxv4i32 && MulSrcType == MVT::nxv16i8) ||
      (ReducedType == MVT::nxv2i64 && MulSrcType == MVT::nxv8i16) ||
      (ReducedType == MVT::v4i32 && MulSrcType == MVT::v16i8) ||
      (ReducedType == MVT::v2i32 && MulSrcType == MVT::v8i8))
    return DAG.getNode(Opcode, DL, ReducedType, Acc, A, B);

  return SDValue();
}
21320+
2125321321
static SDValue performIntrinsicCombine(SDNode *N,
2125421322
TargetLowering::DAGCombinerInfo &DCI,
2125521323
const AArch64Subtarget *Subtarget) {
@@ -21258,6 +21326,12 @@ static SDValue performIntrinsicCombine(SDNode *N,
2125821326
switch (IID) {
2125921327
default:
2126021328
break;
21329+
case Intrinsic::experimental_vector_partial_reduce_add: {
21330+
if (auto Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
21331+
return Dot;
21332+
return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0),
21333+
N->getOperand(1), N->getOperand(2));
21334+
}
2126121335
case Intrinsic::aarch64_neon_vcvtfxs2fp:
2126221336
case Intrinsic::aarch64_neon_vcvtfxu2fp:
2126321337
return tryCombineFixedPointConvert(N, DCI, DAG);

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -991,6 +991,9 @@ class AArch64TargetLowering : public TargetLowering {
991991

992992
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
993993

994+
bool
995+
shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
996+
994997
bool shouldExpandCttzElements(EVT VT) const override;
995998

996999
/// If a change in streaming mode is required on entry to/return from a

0 commit comments

Comments
 (0)