Skip to content

[AArch64][SVE] Add partial reduction SDNodes #117185

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions llvm/include/llvm/CodeGen/ISDOpcodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1451,6 +1451,21 @@ enum NodeType {
VECREDUCE_UMAX,
VECREDUCE_UMIN,

// Partial Reduction nodes, of the form
// PARTIAL_REDUCE_[S|U]MLA(Accumulator, Input1, Input2). These represent
// multiply-add instructions because Input1 and Input2 are multiplied together
// first. This result is then
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought the behaviour of this node is that Input1 and Input2 are first sign- (for SMLA) or zero- (for UMLA) extended to be the same element type as Acc.

nit: Input1 and Input2 are not defined yet. How about starting with a description of the node and its operands, e.g. PARTIAL_REDUCE_*MLA(Acc, Input1, Input2)?

// reduced, by addition, to the number of elements that the Accumulator's type
// has.
// Input1 and Input2 must be the same type. The Accumulator and the Output
// must be the same type.
// The number of elements in Input1 and Input2 must be a positive integer
// multiple of the number of elements in the Accumulator / Output type.
// Input1 and Input2 may have a different element type from Accumulator and
// Output.
// Operands: Accumulator, Input1, Input2
// Outputs: Output
PARTIAL_REDUCE_SMLA,
PARTIAL_REDUCE_UMLA,

// The `llvm.experimental.stackmap` intrinsic.
// Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
// Outputs: output chain, glue
Expand Down
11 changes: 7 additions & 4 deletions llvm/include/llvm/CodeGen/SelectionDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -1604,10 +1604,13 @@ class SelectionDAG {
/// the target's desired shift amount type.
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);

/// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are
/// its operands and ReducedTy is the intrinsic's return type.
/// \p DL Location attached to the nodes that are created.
/// \p ReducedTy Type of each subvector extracted from \p Op2.
/// \p Op1 First operand; it seeds the list of subvectors to be combined.
/// \p Op2 Second operand; it is split into ReducedTy-typed subvectors.
SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
SDValue Op2);
/// Expands PARTIAL_REDUCE_S/UMLA nodes.
/// \p DL Location attached to the nodes created by the expansion.
/// \p Acc Accumulator operand of the partial reduction operation; its value
/// type is used as the reduced type.
/// \p Input1 First input for the partial reduction operation.
/// \p Input2 Second input for the partial reduction operation. It is
/// multiplied with \p Input1 unless it is a SPLAT_VECTOR or BUILD_VECTOR
/// whose first operand is the constant one.
SDValue expandPartialReduceMLA(SDLoc DL, SDValue Acc, SDValue Input1,
SDValue Input2);

/// Expands a node with multiple results to an FP or vector libcall. The
/// libcall is expected to take all the operands of the \p Node followed by
Expand Down
7 changes: 0 additions & 7 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -455,13 +455,6 @@ class TargetLoweringBase {
return true;
}

/// Return true if the @llvm.experimental.vector.partial.reduce.* intrinsic
/// should be expanded using generic code in SelectionDAGBuilder.
/// \p I is the intrinsic call being queried. This default implementation
/// ignores it and always returns true.
virtual bool
shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const {
return true;
}

/// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
/// using generic code in SelectionDAGBuilder.
virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {
Expand Down
20 changes: 15 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2467,19 +2467,29 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
}

SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
SDValue Op2) {
EVT FullTy = Op2.getValueType();
SDValue SelectionDAG::expandPartialReduceMLA(SDLoc DL, SDValue Acc,
SDValue Input1, SDValue Input2) {

EVT FullTy = Input1.getValueType();
unsigned Input2Opcode = Input2.getOpcode();

SDValue Input = Input1;
if ((Input2Opcode != ISD::SPLAT_VECTOR &&
Input2Opcode != ISD::BUILD_VECTOR) ||
!isOneConstant(Input2.getOperand(0)))
Input = getNode(ISD::MUL, DL, FullTy, Input1, Input2);

EVT ReducedTy = Acc.getValueType();

unsigned Stride = ReducedTy.getVectorMinNumElements();
unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;

// Collect all of the subvectors
std::deque<SDValue> Subvectors = {Op1};
std::deque<SDValue> Subvectors = {Acc};
for (unsigned I = 0; I < ScaleFactor; I++) {
auto SourceIndex = getVectorIdxConstant(I * Stride, DL);
Subvectors.push_back(
getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Op2, SourceIndex}));
getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Input, SourceIndex}));
}

// Flatten the subvector tree
Expand Down
29 changes: 20 additions & 9 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8118,15 +8118,26 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::experimental_vector_partial_reduce_add: {

if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
visitTargetIntrinsic(I, Intrinsic);
return;
}

setValue(&I, DAG.getPartialReduceAdd(sdl, EVT::getEVT(I.getType()),
getValue(I.getOperand(0)),
getValue(I.getOperand(1))));
SDLoc dl = getCurSDLoc();
SDValue Acc = getValue(I.getOperand(0));
EVT AccVT = Acc.getValueType();
SDValue Input = getValue(I.getOperand(1));
EVT InputVT = Input.getValueType();

assert(AccVT.getVectorElementType() == InputVT.getVectorElementType() &&
"Expected operands to have the same vector element type!");
assert(InputVT.getVectorElementCount().getKnownMinValue() %
AccVT.getVectorElementCount().getKnownMinValue() ==
0 &&
"Expected the element count of the Input operand to be a positive "
"integer multiple of the element count of the Accumulator operand!");

// ISD::PARTIAL_REDUCE_UMLA is chosen arbitrarily and would function the
// same if ISD::PARTIAL_REDUCE_SMLA were used instead. It should be changed
// to the correct signedness when combining or expanding, according to the
// extends performed on Input.
setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, dl, AccVT, Acc, Input,
DAG.getConstant(1, dl, InputVT)));
return;
}
case Intrinsic::experimental_cttz_elts: {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::VECTOR_FIND_LAST_ACTIVE:
return "find_last_active";

case ISD::PARTIAL_REDUCE_UMLA:
return "partial_reduce_umla";
case ISD::PARTIAL_REDUCE_SMLA:
return "partial_reduce_smla";

// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
case ISD::SDID: \
Expand Down
Loading
Loading