Skip to content

Commit b794dc2

Browse files
[SystemZ] Add custom handling of legal vectors with reduce-add. (#88495)
This commit skips the expansion of the `vector.reduce.add` intrinsic on vector-enabled SystemZ targets in order to introduce custom handling of `vector.reduce.add` for legal vector types using the VSUM instructions. This is limited to full vectors with scalar types up to `i32` due to performance concerns. It also adds testing for the generation of such custom handling, and adapts the related cost computation, as well as the testing for that. The expected result is a performance boost in certain benchmarks that make heavy use of `vector.reduce.add` with other benchmarks remaining constant. For instance, the assembly for `vector.reduce.add<4 x i32>` changes from ```hlasm vmrlg %v0, %v24, %v24 vaf %v0, %v24, %v0 vrepf %v1, %v0, 1 vaf %v0, %v0, %v1 vlgvf %r2, %v0, 0 ``` to ```hlasm vgbm %v0, 0 vsumqf %v0, %v24, %v0 vlgvf %r2, %v0, 3 ```
1 parent b614e5b commit b794dc2

File tree

6 files changed

+373
-17
lines changed

6 files changed

+373
-17
lines changed

llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@
1616
#include "SystemZMachineFunctionInfo.h"
1717
#include "SystemZTargetMachine.h"
1818
#include "llvm/CodeGen/CallingConvLower.h"
19+
#include "llvm/CodeGen/ISDOpcodes.h"
1920
#include "llvm/CodeGen/MachineInstrBuilder.h"
2021
#include "llvm/CodeGen/MachineRegisterInfo.h"
2122
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
2223
#include "llvm/IR/IntrinsicInst.h"
2324
#include "llvm/IR/Intrinsics.h"
2425
#include "llvm/IR/IntrinsicsS390.h"
2526
#include "llvm/Support/CommandLine.h"
27+
#include "llvm/Support/ErrorHandling.h"
2628
#include "llvm/Support/KnownBits.h"
2729
#include <cctype>
2830
#include <optional>
@@ -451,6 +453,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
451453
setOperationAction(ISD::SRL, VT, Custom);
452454
setOperationAction(ISD::ROTL, VT, Custom);
453455

456+
// Add ISD::VECREDUCE_ADD as custom in order to implement
457+
// it with VZERO+VSUM
458+
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
459+
454460
// Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
455461
// and inverting the result as necessary.
456462
setOperationAction(ISD::SETCC, VT, Custom);
@@ -6167,6 +6173,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
61676173
return lowerOR(Op, DAG);
61686174
case ISD::CTPOP:
61696175
return lowerCTPOP(Op, DAG);
6176+
case ISD::VECREDUCE_ADD:
6177+
return lowerVECREDUCE_ADD(Op, DAG);
61706178
case ISD::ATOMIC_FENCE:
61716179
return lowerATOMIC_FENCE(Op, DAG);
61726180
case ISD::ATOMIC_SWAP:
@@ -9600,3 +9608,38 @@ SDValue SystemZTargetLowering::lowerGET_ROUNDING(SDValue Op,
96009608

96019609
return DAG.getMergeValues({RetVal, Chain}, dl);
96029610
}
9611+
9612+
// Lower ISD::VECREDUCE_ADD using the VSUM family of instructions instead of
// the generic log2(N) shuffle/add expansion. VSUM accumulates the element
// sums into wider lanes; after the final VSUM the total resides in the last
// element of the (bitcast) vector, which is extracted as the scalar result.
// Only reached for vector types marked Custom for VECREDUCE_ADD.
SDValue SystemZTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // VT is the scalar result type; Op is re-bound to the vector being reduced.
  EVT VT = Op.getValueType();
  Op = Op.getOperand(0);
  EVT OpVT = Op.getValueType();

  assert(OpVT.isVector() && "Operand type for VECREDUCE_ADD is not a vector.");

  SDLoc DL(Op);

  // Load a 0 vector for the third operand of VSUM (the add-in accumulator).
  SDValue Zero = DAG.getSplatBuildVector(OpVT, DL, DAG.getConstant(0, DL, VT));

  // Execute VSUM. i8/i16 elements are first summed into i32 lanes, then
  // (falling through, like i32/i64) summed into a single i128 value.
  switch (OpVT.getScalarSizeInBits()) {
  case 8:
  case 16:
    Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Zero);
    LLVM_FALLTHROUGH;
  case 32:
  case 64:
    // Zero still has the original element type; bitcast it to match the
    // (possibly widened-by-fallthrough) type of Op.
    Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::i128, Op,
                     DAG.getBitcast(Op.getValueType(), Zero));
    break;
  case 128:
    break; // VSUM over v1i128 should not happen and would be a noop.
  default:
    llvm_unreachable("Unexpected scalar size.");
  }
  // Cast back to the original vector type and retrieve the last element,
  // which holds the accumulated sum (cf. the `vlgvf %r2, %v0, 3` in the
  // commit message's example lowering).
  return DAG.getNode(
      ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(OpVT, Op),
      DAG.getConstant(OpVT.getVectorNumElements() - 1, DL, MVT::i32));
}

llvm/lib/Target/SystemZ/SystemZISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ class SystemZTargetLowering : public TargetLowering {
696696
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
697697
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
698698
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
699+
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
699700
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
700701
SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
701702
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "llvm/CodeGen/CostTable.h"
2020
#include "llvm/CodeGen/TargetLowering.h"
2121
#include "llvm/IR/IntrinsicInst.h"
22+
#include "llvm/IR/Intrinsics.h"
2223
#include "llvm/Support/Debug.h"
2324
#include "llvm/Support/MathExtras.h"
2425

@@ -1293,18 +1294,14 @@ getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
12931294
if (ID == Intrinsic::vector_reduce_add) {
12941295
// Retrieve number and size of elements for the vector op.
12951296
auto *VTy = cast<FixedVectorType>(ParamTys.front());
1296-
unsigned NumElements = VTy->getNumElements();
12971297
unsigned ScalarSize = VTy->getScalarSizeInBits();
12981298
// For scalar sizes >128 bits, we fall back to the generic cost estimate.
12991299
if (ScalarSize > SystemZ::VectorBits)
13001300
return -1;
1301-
// A single vector register can hold this many elements.
1302-
unsigned MaxElemsPerVector = SystemZ::VectorBits / ScalarSize;
13031301
// This many vector regs are needed to represent the input elements (V).
13041302
unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
13051303
// This many instructions are needed for the final sum of vector elems (S).
1306-
unsigned LastVectorHandling =
1307-
2 * Log2_32_Ceil(std::min(NumElements, MaxElemsPerVector));
1304+
unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
13081305
// We use vector adds to create a sum vector, which takes
13091306
// V/2 + V/4 + ... = V - 1 operations.
13101307
// Then, we need S operations to sum up the elements of that sum vector,
@@ -1324,3 +1321,27 @@ SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
13241321
return Cost;
13251322
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
13261323
}
1324+
1325+
// Decide whether a vector reduction intrinsic should be expanded into
// generic shuffle/add IR (return true) or kept intact so the backend's
// custom lowering can handle it (return false).
bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Always expand on subtargets without vector instructions.
  if (!ST->hasVector())
    return true;

  // Always expand for operands that do not fill one vector register —
  // custom lowering is only profitable for full vectors.
  auto *Type = cast<FixedVectorType>(II->getOperand(0)->getType());
  unsigned NumElts = Type->getNumElements();
  unsigned ScalarSize = Type->getScalarSizeInBits();
  unsigned MaxElts = SystemZ::VectorBits / ScalarSize;
  if (NumElts < MaxElts)
    return true;

  // Otherwise decide per intrinsic.
  switch (II->getIntrinsicID()) {
  // Do not expand vector.reduce.add (it is custom-lowered via VSUM) ...
  case Intrinsic::vector_reduce_add:
    // ... except for scalar sizes of i64 and up, where the performance
    // benefit is dubious (note the polarity: true here means "expand").
    return ScalarSize >= 64;
  default:
    return true;
  }
}

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
127127

128128
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
129129
TTI::TargetCostKind CostKind);
130+
131+
bool shouldExpandReduction(const IntrinsicInst *II) const;
130132
/// @}
131133
};
132134

llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,19 @@ define void @reduce(ptr %src, ptr %dst) {
77
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
88
; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
99
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
10-
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
11-
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
12-
; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
13-
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
14-
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
15-
; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
16-
; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
17-
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
18-
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
19-
; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
20-
; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
10+
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
11+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
12+
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
13+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
14+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
15+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
16+
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
17+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
18+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
19+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
20+
; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
2121
;
22-
; CHECK: Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
22+
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
2323
; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
2424

2525
; REDUCEADD64

0 commit comments

Comments
 (0)