llvm · uweigand · Apr 12, 2024 · Mar 5, 2024 · Apr 12, 2024 · Apr 12, 2024
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -16,13 +16,15 @@
 #include "SystemZMachineFunctionInfo.h"
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsS390.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 #include <cctype>
 #include <optional>
@@ -444,6 +446,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SRL, VT, Custom);
       setOperationAction(ISD::ROTL, VT, Custom);
 
+      // Add ISD::VECREDUCE_ADD as custom in order to implement
+      // it with VZERO+VSUM
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+
       // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
       // and inverting the result as necessary.
       setOperationAction(ISD::SETCC, VT, Custom);
@@ -6133,6 +6139,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerOR(Op, DAG);
   case ISD::CTPOP:
     return lowerCTPOP(Op, DAG);
+  case ISD::VECREDUCE_ADD:
+    return lowerVECREDUCE_ADD(Op, DAG);
   case ISD::ATOMIC_FENCE:
     return lowerATOMIC_FENCE(Op, DAG);
   case ISD::ATOMIC_SWAP:
@@ -9505,3 +9513,38 @@ SDValue SystemZTargetLowering::lowerGET_ROUNDING(SDValue Op,
 
   return DAG.getMergeValues({RetVal, Chain}, dl);
 }
+
+SDValue SystemZTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  Op = Op.getOperand(0);
+  EVT OpVT = Op.getValueType();
+
+  assert(OpVT.isVector() && "Operand type for VECREDUCE_ADD is not a vector.");
+
+  SDLoc DL(Op);
+
+  // load a 0 vector for the third operand of VSUM.
+  SDValue Zero = DAG.getSplatBuildVector(OpVT, DL, DAG.getConstant(0, DL, VT));
+
+  // execute VSUM.
+  switch (OpVT.getScalarSizeInBits()) {
+  case 8:
+  case 16:
+    Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Zero);
+    LLVM_FALLTHROUGH;
+  case 32:
+  case 64:
+    Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::i128, Op,
+                     DAG.getBitcast(Op.getValueType(), Zero));
+    break;
+  case 128:
+    break; // VSUM over v1i128 should not happen and would be a noop
+  default:
+    llvm_unreachable("Unexpected scalar size.");
+  }
+  // Cast to original vector type, retrieve last element.
+  return DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(OpVT, Op),
+      DAG.getConstant(OpVT.getVectorNumElements() - 1, DL, MVT::i32));
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -691,6 +691,7 @@ class SystemZTargetLowering : public TargetLowering {
   SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -1295,18 +1296,14 @@ getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
   if (ID == Intrinsic::vector_reduce_add) {
     // Retrieve number and size of elements for the vector op.
     auto *VTy = cast<FixedVectorType>(ParamTys.front());
-    unsigned NumElements = VTy->getNumElements();
     unsigned ScalarSize = VTy->getScalarSizeInBits();
     // For scalar sizes >128 bits, we fall back to the generic cost estimate.
     if (ScalarSize > SystemZ::VectorBits)
       return -1;
-    // A single vector register can hold this many elements.
-    unsigned MaxElemsPerVector = SystemZ::VectorBits / ScalarSize;
     // This many vector regs are needed to represent the input elements (V).
     unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
     // This many instructions are needed for the final sum of vector elems (S).
-    unsigned LastVectorHandling =
-        2 * Log2_32_Ceil(std::min(NumElements, MaxElemsPerVector));
+    unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
     // We use vector adds to create a sum vector, which takes
     // V/2 + V/4 + ... = V - 1 operations.
     // Then, we need S operations to sum up the elements of that sum vector,
@@ -1326,3 +1323,27 @@ SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     return Cost;
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
+
+bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+  // Always expand on Subtargets without vector instructions
+  if (!ST->hasVector())
+    return true;
+
+  // Always expand for operands that do not fill one vector reg
+  auto *Type = cast<FixedVectorType>(II->getOperand(0)->getType());
+  unsigned NumElts = Type->getNumElements();
+  unsigned ScalarSize = Type->getScalarSizeInBits();
+  unsigned MaxElts = SystemZ::VectorBits / ScalarSize;
+  if (NumElts < MaxElts)
+    return true;
+
+  // Otherwise
+  switch (II->getIntrinsicID()) {
+  // Do not expand vector.reduce.add
+  case Intrinsic::vector_reduce_add:
+    // Except for i64, since the performance benefit is dubious there
+    return ScalarSize >= 64;
+  default:
+    return true;
+  }
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -126,6 +126,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
 
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind);
+
+  bool shouldExpandReduction(const IntrinsicInst *II) const;
   /// @}
 };
 

diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
@@ -7,19 +7,19 @@ define void @reduce(ptr %src, ptr %dst) {
 ; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
 ; CHECK:  Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
 ; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
-; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
-; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
-; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
-; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
-; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
-; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
-; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
-; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
-; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
-; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
-; CHECK:  Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
 ;
-; CHECK:  Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
 ; CHECK:  Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
 
   ; REDUCEADD64
-Original file line number
+Diff line change
@@ Expand Up @@
       InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                             TTI::TargetCostKind CostKind);
+      bool shouldExpandReduction(const IntrinsicInst *II) const;
       /// @}
     };
@@ Expand Down @@