Skip to content

Pick Vector Dot product improvements. #9858

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,12 @@ typedef TargetTransformInfo TTI;
/// for IR-level transformations.
class TargetTransformInfo {
public:
/// The kind of extension (if any) feeding an operand of a partial
/// reduction: no extend, sign-extend, or zero-extend.
enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };

/// Get the kind of extension that an instruction represents.
/// Returns PR_SignExtend for sext, PR_ZeroExtend for zext, PR_None otherwise.
static PartialReductionExtendKind
getPartialReductionExtendKind(Instruction *I);

/// Construct a TTI object using a type implementing the \c Concept
/// API below.
///
Expand Down Expand Up @@ -1257,6 +1263,20 @@ class TargetTransformInfo {
/// \return if target want to issue a prefetch in address space \p AS.
bool shouldPrefetchAddressSpace(unsigned AS) const;

/// \return The cost of a partial reduction, which is a reduction from a
/// vector to another vector with fewer elements of larger size. They are
/// represented by the llvm.experimental.partial.reduce.add intrinsic, which
/// takes an accumulator and a binary operation operand that itself is fed by
/// two extends. An example of an operation that uses a partial reduction is a
/// dot product, which reduces two vectors to another of 4 times fewer and 4
/// times larger elements.
InstructionCost
getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
Type *AccumType, ElementCount VF,
PartialReductionExtendKind OpAExtend,
PartialReductionExtendKind OpBExtend,
std::optional<unsigned> BinOp = std::nullopt) const;

/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
Expand Down Expand Up @@ -2034,6 +2054,20 @@ class TargetTransformInfo::Concept {
/// \return if target want to issue a prefetch in address space \p AS.
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;

/// \return The cost of a partial reduction, which is a reduction from a
/// vector to another vector with fewer elements of larger size. They are
/// represented by the llvm.experimental.partial.reduce.add intrinsic, which
/// takes an accumulator and a binary operation operand that itself is fed by
/// two extends. An example of an operation that uses a partial reduction is a
/// dot product, which reduces two vectors to another of 4 times fewer and 4
/// times larger elements.
virtual InstructionCost
getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
Type *AccumType, ElementCount VF,
PartialReductionExtendKind OpAExtend,
PartialReductionExtendKind OpBExtend,
std::optional<unsigned> BinOp) const = 0;

virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
virtual InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
Expand Down Expand Up @@ -2669,6 +2703,16 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.shouldPrefetchAddressSpace(AS);
}

/// \returns The implementation's cost for a partial reduction.
/// Note: no default argument for \p BinOp here. Default arguments on a
/// virtual override bind statically and can silently diverge from the base
/// interface (the pure-virtual Concept declaration has none); all calls come
/// through the TargetTransformInfo facade, which supplies the default.
InstructionCost getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, PartialReductionExtendKind OpAExtend,
    PartialReductionExtendKind OpBExtend,
    std::optional<unsigned> BinOp) const override {
  // Delegate to the wrapped target implementation.
  return Impl.getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
                                      AccumType, VF, OpAExtend, OpBExtend,
                                      BinOp);
}

unsigned getMaxInterleaveFactor(ElementCount VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
Expand Down
9 changes: 9 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,15 @@ class TargetTransformInfoImplBase {
bool enableWritePrefetching() const { return false; }
bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; }

/// Default (base) implementation: no target supports partial reductions
/// unless it overrides this hook, so report the cost as invalid to make
/// cost-model clients fall back to other strategies.
InstructionCost
getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
                        Type *AccumType, ElementCount VF,
                        TTI::PartialReductionExtendKind OpAExtend,
                        TTI::PartialReductionExtendKind OpBExtend,
                        std::optional<unsigned> BinOp = std::nullopt) const {
  return InstructionCost::getInvalid();
}

unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }

InstructionCost getArithmeticInstrCost(
Expand Down
5 changes: 5 additions & 0 deletions llvm/include/llvm/CodeGen/SelectionDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -1587,6 +1587,11 @@ class SelectionDAG {
/// the target's desired shift amount type.
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);

/// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are
/// its operands and ReducedTY is the intrinsic's return type.
SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
SDValue Op2);

/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
SDValue expandVAArg(SDNode *Node);

Expand Down
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,13 @@ class TargetLoweringBase {
return true;
}

/// Return true if the @llvm.experimental.vector.partial.reduce.* intrinsic
/// should be expanded using generic code in SelectionDAGBuilder.
/// The default is to always expand; targets that can select the intrinsic
/// directly should override this and return false for the cases they handle.
virtual bool
shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const {
  return true;
}

/// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
/// using generic code in SelectionDAGBuilder.
virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,15 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const {
return TTIImpl->shouldPrefetchAddressSpace(AS);
}

// Facade entry point: forwards the partial-reduction cost query unchanged to
// the target's TTI implementation.
InstructionCost TargetTransformInfo::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, PartialReductionExtendKind OpAExtend,
    PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp) const {
  return TTIImpl->getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
                                          AccumType, VF, OpAExtend, OpBExtend,
                                          BinOp);
}

unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
Expand Down Expand Up @@ -940,6 +949,15 @@ InstructionCost TargetTransformInfo::getShuffleCost(
return Cost;
}

TargetTransformInfo::PartialReductionExtendKind
TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
  // Classify by the kind of integer extension the instruction performs; an
  // instruction is at most one of zext/sext, so the check order is immaterial.
  if (isa<ZExtInst>(I))
    return PR_ZeroExtend;
  return isa<SExtInst>(I) ? PR_SignExtend : PR_None;
}

TTI::CastContextHint
TargetTransformInfo::getCastContextHint(const Instruction *I) {
if (!I)
Expand Down
30 changes: 30 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <deque>
#include <limits>
#include <optional>
#include <set>
Expand Down Expand Up @@ -2411,6 +2412,35 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
}

SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
                                          SDValue Op2) {
  EVT FullTy = Op2.getValueType();

  // Op2 decomposes into ScaleFactor chunks, each as wide as the reduced
  // result type.
  unsigned Stride = ReducedTy.getVectorMinNumElements();
  unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;

  // Seed the worklist with the accumulator followed by every chunk of Op2.
  std::deque<SDValue> WorkList;
  WorkList.push_back(Op1);
  for (unsigned Part = 0; Part != ScaleFactor; ++Part) {
    SDValue Idx = getVectorIdxConstant(Part * Stride, DL);
    WorkList.push_back(
        getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, Op2, Idx));
  }

  // Repeatedly add the two oldest entries and queue the sum, building a
  // balanced tree of ISD::ADD nodes until a single value remains.
  while (WorkList.size() >= 2) {
    SDValue A = WorkList.front();
    WorkList.pop_front();
    SDValue B = WorkList.front();
    WorkList.pop_front();
    WorkList.push_back(getNode(ISD::ADD, DL, ReducedTy, A, B));
  }

  return WorkList.front();
}

SDValue SelectionDAG::expandVAArg(SDNode *Node) {
SDLoc dl(Node);
const TargetLowering &TLI = getTargetLoweringInfo();
Expand Down
31 changes: 6 additions & 25 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8000,34 +8000,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::experimental_vector_partial_reduce_add: {
SDValue OpNode = getValue(I.getOperand(1));
EVT ReducedTy = EVT::getEVT(I.getType());
EVT FullTy = OpNode.getValueType();

unsigned Stride = ReducedTy.getVectorMinNumElements();
unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;

// Collect all of the subvectors
std::deque<SDValue> Subvectors;
Subvectors.push_back(getValue(I.getOperand(0)));
for (unsigned i = 0; i < ScaleFactor; i++) {
auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
{OpNode, SourceIndex}));
}

// Flatten the subvector tree
while (Subvectors.size() > 1) {
Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
{Subvectors[0], Subvectors[1]}));
Subvectors.pop_front();
Subvectors.pop_front();
if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
visitTargetIntrinsic(I, Intrinsic);
return;
}

assert(Subvectors.size() == 1 &&
"There should only be one subvector after tree flattening");

setValue(&I, Subvectors[0]);
setValue(&I, DAG.getPartialReduceAdd(sdl, EVT::getEVT(I.getType()),
getValue(I.getOperand(0)),
getValue(I.getOperand(1))));
return;
}
case Intrinsic::experimental_cttz_elts: {
Expand Down
74 changes: 74 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1971,6 +1971,16 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
return false;
}

bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
const IntrinsicInst *I) const {
if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
return true;

EVT VT = EVT::getEVT(I->getType());
return VT != MVT::nxv4i32 && VT != MVT::nxv2i64 && VT != MVT::v4i32 &&
VT != MVT::v2i32;
}

bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
if (!Subtarget->isSVEorStreamingSVEAvailable())
return true;
Expand Down Expand Up @@ -21250,6 +21260,64 @@ static SDValue tryCombineWhileLo(SDNode *N,
return SDValue(N, 0);
}

/// Attempt to lower @llvm.experimental.vector.partial.reduce.add to an
/// AArch64 SDOT/UDOT node. The matched pattern is
///   partial.reduce.add(Acc, mul(ext(A), ext(B)))
/// where both extends have the same signedness and the type pairing matches a
/// dot-product instruction (a 4:1 element reduction). Returns an empty
/// SDValue when the node cannot be lowered this way.
/// Marked \c static for internal linkage, consistent with the other
/// file-local combine helpers in this file.
static SDValue tryLowerPartialReductionToDot(SDNode *N,
                                             const AArch64Subtarget *Subtarget,
                                             SelectionDAG &DAG) {

  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
         getIntrinsicID(N) ==
             Intrinsic::experimental_vector_partial_reduce_add &&
         "Expected a partial reduction node");

  // Scalable dot products need SVE (or streaming SVE); fixed-width ones need
  // NEON plus the DotProd extension.
  bool Scalable = N->getValueType(0).isScalableVector();
  if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
    return SDValue();
  if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
    return SDValue();

  SDLoc DL(N);

  // The narrower of the two operands. Used as the accumulator.
  SDValue NarrowOp = N->getOperand(1);
  SDValue MulOp = N->getOperand(2);
  if (MulOp->getOpcode() != ISD::MUL)
    return SDValue();

  // Both multiply operands must be identically-signed extends.
  SDValue ExtA = MulOp->getOperand(0);
  SDValue ExtB = MulOp->getOperand(1);
  bool IsSExt = ExtA->getOpcode() == ISD::SIGN_EXTEND;
  bool IsZExt = ExtA->getOpcode() == ISD::ZERO_EXTEND;
  if (ExtA->getOpcode() != ExtB->getOpcode() || (!IsSExt && !IsZExt))
    return SDValue();

  SDValue A = ExtA->getOperand(0);
  SDValue B = ExtB->getOperand(0);
  if (A.getValueType() != B.getValueType())
    return SDValue();

  // Exactly one of IsSExt/IsZExt holds at this point (checked above), so the
  // opcode choice is total.
  unsigned Opcode = IsSExt ? AArch64ISD::SDOT : AArch64ISD::UDOT;

  EVT ReducedType = N->getValueType(0);
  EVT MulSrcType = A.getValueType();

  // Dot products operate on chunks of four elements so there must be four
  // times as many elements in the wide type.
  if ((ReducedType == MVT::nxv4i32 && MulSrcType == MVT::nxv16i8) ||
      (ReducedType == MVT::nxv2i64 && MulSrcType == MVT::nxv8i16) ||
      (ReducedType == MVT::v4i32 && MulSrcType == MVT::v16i8) ||
      (ReducedType == MVT::v2i32 && MulSrcType == MVT::v8i8))
    return DAG.getNode(Opcode, DL, ReducedType, NarrowOp, A, B);

  return SDValue();
}

static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
Expand All @@ -21258,6 +21326,12 @@ static SDValue performIntrinsicCombine(SDNode *N,
switch (IID) {
default:
break;
case Intrinsic::experimental_vector_partial_reduce_add: {
if (auto Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
return Dot;
return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
}
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -991,6 +991,9 @@ class AArch64TargetLowering : public TargetLowering {

bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;

bool
shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;

bool shouldExpandCttzElements(EVT VT) const override;

/// If a change in streaming mode is required on entry to/return from a
Expand Down
Loading