Skip to content

[SystemZ] Add custom handling of legal vectors with reduce-add. #88495

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@
#include "SystemZMachineFunctionInfo.h"
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsS390.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include <cctype>
#include <optional>
Expand Down Expand Up @@ -444,6 +446,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);

// Add ISD::VECREDUCE_ADD as custom in order to implement
// it with VZERO+VSUM
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);

// Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
// and inverting the result as necessary.
setOperationAction(ISD::SETCC, VT, Custom);
Expand Down Expand Up @@ -6133,6 +6139,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerOR(Op, DAG);
case ISD::CTPOP:
return lowerCTPOP(Op, DAG);
case ISD::VECREDUCE_ADD:
return lowerVECREDUCE_ADD(Op, DAG);
case ISD::ATOMIC_FENCE:
return lowerATOMIC_FENCE(Op, DAG);
case ISD::ATOMIC_SWAP:
Expand Down Expand Up @@ -9505,3 +9513,38 @@ SDValue SystemZTargetLowering::lowerGET_ROUNDING(SDValue Op,

return DAG.getMergeValues({RetVal, Chain}, dl);
}

/// Custom lowering for ISD::VECREDUCE_ADD on a legal vector operand.
///
/// The reduction is built from SystemZISD::VSUM nodes that take an all-zero
/// vector as their second operand. Vectors with 8- or 16-bit elements are
/// first summed into a v4i32, and that (or a vector with 32-/64-bit elements)
/// is then summed into an i128. The result is bitcast back to the operand's
/// vector type and the last element is extracted as the scalar result.
///
/// \param Op  The VECREDUCE_ADD node; its operand 0 is the vector to reduce.
/// \param DAG The selection DAG in which the replacement nodes are created.
/// \returns An EXTRACT_VECTOR_ELT node of Op's value type.
SDValue SystemZTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
                                                  SelectionDAG &DAG) const {
  const EVT ResVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  const EVT VecVT = Vec.getValueType();

  assert(VecVT.isVector() &&
         "Operand type for VECREDUCE_ADD is not a vector.");

  // The debug location is taken from the vector operand, not the reduce node.
  SDLoc DL(Vec);

  // Build the zero vector that VSUM takes as its second operand.
  SDValue ZeroElt = DAG.getConstant(0, DL, ResVT);
  SDValue ZeroVec = DAG.getSplatBuildVector(VecVT, DL, ZeroElt);

  // Classify the element width; anything other than 8/16/32/64/128 is a bug.
  const unsigned EltBits = VecVT.getScalarSizeInBits();
  const bool NeedsTwoSums = EltBits == 8 || EltBits == 16;
  const bool NeedsOneSum = EltBits == 32 || EltBits == 64;
  if (!NeedsTwoSums && !NeedsOneSum && EltBits != 128)
    llvm_unreachable("Unexpected scalar size.");

  // Narrow elements: first accumulate into a v4i32.
  if (NeedsTwoSums)
    Vec = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Vec, ZeroVec);

  // Accumulate into an i128. The zero vector is bitcast to whatever type the
  // running value currently has (v4i32 after the step above, otherwise the
  // original operand type). A 128-bit element needs no VSUM at all: VSUM over
  // v1i128 should not happen and would be a noop.
  if (NeedsTwoSums || NeedsOneSum) {
    SDValue ZeroSameTy = DAG.getBitcast(Vec.getValueType(), ZeroVec);
    Vec = DAG.getNode(SystemZISD::VSUM, DL, MVT::i128, Vec, ZeroSameTy);
  }

  // Reinterpret the sum in the original vector type and pick its last element.
  SDValue SumAsVec = DAG.getBitcast(VecVT, Vec);
  SDValue LastIdx =
      DAG.getConstant(VecVT.getVectorNumElements() - 1, DL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, SumAsVec, LastIdx);
}
1 change: 1 addition & 0 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
Expand Down
31 changes: 26 additions & 5 deletions llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

Expand Down Expand Up @@ -1295,18 +1296,14 @@ getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
if (ID == Intrinsic::vector_reduce_add) {
// Retrieve number and size of elements for the vector op.
auto *VTy = cast<FixedVectorType>(ParamTys.front());
unsigned NumElements = VTy->getNumElements();
unsigned ScalarSize = VTy->getScalarSizeInBits();
// For scalar sizes >128 bits, we fall back to the generic cost estimate.
if (ScalarSize > SystemZ::VectorBits)
return -1;
// A single vector register can hold this many elements.
unsigned MaxElemsPerVector = SystemZ::VectorBits / ScalarSize;
// This many vector regs are needed to represent the input elements (V).
unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
// This many instructions are needed for the final sum of vector elems (S).
unsigned LastVectorHandling =
2 * Log2_32_Ceil(std::min(NumElements, MaxElemsPerVector));
unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
// We use vector adds to create a sum vector, which takes
// V/2 + V/4 + ... = V - 1 operations.
// Then, we need S operations to sum up the elements of that sum vector,
Expand All @@ -1326,3 +1323,27 @@ SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return Cost;
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Decide whether the vector reduction intrinsic \p II should be expanded.
///
/// \param II The reduction intrinsic call being queried.
/// \returns true to request expansion; false only for a vector.reduce.add
///          whose operand has at least as many elements as fit in one vector
///          register and whose element size is below 64 bits.
bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Always expand on subtargets without vector instructions.
  if (!ST->hasVector())
    return true;

  // Whether or not to expand is a per-intrinsic decision. The operand type is
  // deliberately inspected only inside the case that needs it: operand 0 is
  // not a vector for every reduction intrinsic (vector.reduce.fadd/fmul take
  // a scalar start value first), so casting it to FixedVectorType up front
  // would assert for those intrinsics.
  switch (II->getIntrinsicID()) {
  default:
    return true;
  case Intrinsic::vector_reduce_add: {
    auto *VTy = cast<FixedVectorType>(II->getOperand(0)->getType());
    const unsigned NumElts = VTy->getNumElements();
    const unsigned ScalarSize = VTy->getScalarSizeInBits();

    // Always expand for operands that do not fill one vector register.
    const unsigned MaxElts = SystemZ::VectorBits / ScalarSize;
    if (NumElts < MaxElts)
      return true;

    // Do not expand vector.reduce.add, except for i64 (and wider), since the
    // performance benefit is dubious there.
    return ScalarSize >= 64;
  }
  }
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {

InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);

bool shouldExpandReduction(const IntrinsicInst *II) const;
/// @}
};

Expand Down
24 changes: 12 additions & 12 deletions llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@ define void @reduce(ptr %src, ptr %dst) {
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
;
; CHECK: Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)

; REDUCEADD64
Expand Down
Loading