Commit d30236d

[VectorCombine] Fold reduce(trunc(x)) -> trunc(reduce(x)) iff cost effective
Vector truncations can be pretty expensive, especially on X86, whilst scalar truncations are often free. If performing the add/mul/and/or/xor reduction on the pre-truncated type is cheap enough, avoid the vector truncation entirely.

Fixes #81469
1 parent 7de4f82 commit d30236d
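
For illustration, here is a minimal before/after sketch of the rewrite this commit performs (the function names are hypothetical; whether the fold actually fires depends on the target's reported costs):

; Before: truncate the vector, then reduce on the narrow type.
define i8 @example_reduce_or_trunc(<8 x i32> %a0) {
  %tr  = trunc <8 x i32> %a0 to <8 x i8>
  %red = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %tr)
  ret i8 %red
}

; After (when the cost model prefers it): reduce on the original wide type,
; then truncate the single scalar result.
define i8 @example_reduce_or_trunc_folded(<8 x i32> %a0) {
  %red = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
  %tr  = trunc i32 %red to i8
  ret i8 %tr
}

declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)

The X86 tests below exercise this same pattern; since the fold lives in VectorCombine, it can be reproduced by running opt -passes=vector-combine on IR like the above.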

File tree

2 files changed (+76, -17 lines)

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 57 additions & 0 deletions
@@ -29,6 +29,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include <numeric>
 #include <queue>

@@ -111,6 +112,7 @@ class VectorCombine {
   bool scalarizeLoadExtract(Instruction &I);
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldTruncFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);

   void replaceValue(Value &Old, Value &New) {
@@ -1526,6 +1528,60 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return foldSelectShuffle(*Shuffle, true);
 }

+/// Determine if its more efficient to fold:
+/// reduce(trunc(x)) -> trunc(reduce(x)).
+bool VectorCombine::foldTruncFromReductions(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  if (!II)
+    return false;
+
+  Intrinsic::ID IID = II->getIntrinsicID();
+  switch (IID) {
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_mul:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_xor:
+    break;
+  default:
+    return false;
+  }
+
+  unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
+  Value *ReductionSrc = I.getOperand(0);
+
+  Value *TruncSrc;
+  if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(TruncSrc)))))
+    return false;
+
+  auto *Trunc = cast<CastInst>(ReductionSrc);
+  auto *TruncSrcTy = cast<VectorType>(TruncSrc->getType());
+  auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
+  Type *ResultTy = I.getType();
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getCastInstrCost(Instruction::Trunc, ReductionSrcTy, TruncSrcTy,
+                           TTI::CastContextHint::None, CostKind, Trunc) +
+      TTI.getArithmeticReductionCost(ReductionOpc, ReductionSrcTy, std::nullopt,
+                                     CostKind);
+  InstructionCost NewCost =
+      TTI.getArithmeticReductionCost(ReductionOpc, TruncSrcTy, std::nullopt,
+                                     CostKind) +
+      TTI.getCastInstrCost(Instruction::Trunc, ResultTy,
+                           ReductionSrcTy->getScalarType(),
+                           TTI::CastContextHint::None, CostKind);
+
+  if (OldCost <= NewCost || !NewCost.isValid())
+    return false;
+
+  Value *NewReduction = Builder.CreateIntrinsic(
+      TruncSrcTy->getScalarType(), II->getIntrinsicID(), {TruncSrc});
+  Value *NewTruncation = Builder.CreateTrunc(NewReduction, ResultTy);
+  replaceValue(I, *NewTruncation);
+  return true;
+}
+
 /// This method looks for groups of shuffles acting on binops, of the form:
 ///  %x = shuffle ...
 ///  %y = shuffle ...
@@ -1917,6 +1973,7 @@ bool VectorCombine::run() {
       switch (Opcode) {
       case Instruction::Call:
         MadeChange |= foldShuffleFromReductions(I);
+        MadeChange |= foldTruncFromReductions(I);
        break;
      case Instruction::ICmp:
      case Instruction::FCmp:

llvm/test/Transforms/VectorCombine/X86/reduction-of-truncations.ll

Lines changed: 19 additions & 17 deletions
@@ -8,24 +8,29 @@
 ; Fold reduce(trunc(X)) -> trunc(reduce(X)) if more cost efficient
 ;

-; TODO: Cheap AVX512 v8i64 -> v8i32 truncation
+; Cheap AVX512 v8i64 -> v8i32 truncation
 define i32 @reduce_add_trunc_v8i64_i32(<8 x i64> %a0) {
-; CHECK-LABEL: @reduce_add_trunc_v8i64_i32(
-; CHECK-NEXT:    [[TR:%.*]] = trunc <8 x i64> [[A0:%.*]] to <8 x i32>
-; CHECK-NEXT:    [[RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TR]])
-; CHECK-NEXT:    ret i32 [[RED]]
+; X64-LABEL: @reduce_add_trunc_v8i64_i32(
+; X64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]])
+; X64-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i32
+; X64-NEXT:    ret i32 [[RED]]
+;
+; AVX512-LABEL: @reduce_add_trunc_v8i64_i32(
+; AVX512-NEXT:    [[TR:%.*]] = trunc <8 x i64> [[A0:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TR]])
+; AVX512-NEXT:    ret i32 [[RED]]
 ;
   %tr = trunc <8 x i64> %a0 to <8 x i32>
   %red = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %tr)
   ret i32 %red
 }
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

-; TODO: No legal vXi8 multiplication so vXi16 is always cheaper
+; No legal vXi8 multiplication so vXi16 is always cheaper
 define i8 @reduce_mul_trunc_v16i16_i8(<16 x i16> %a0) {
 ; CHECK-LABEL: @reduce_mul_trunc_v16i16_i8(
-; CHECK-NEXT:    [[TR:%.*]] = trunc <16 x i16> [[A0:%.*]] to <16 x i8>
-; CHECK-NEXT:    [[RED:%.*]] = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> [[TR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i16 [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[RED]]
 ;
   %tr = trunc <16 x i16> %a0 to <16 x i8>
@@ -36,8 +41,8 @@ declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)

 define i8 @reduce_or_trunc_v8i32_i8(<8 x i32> %a0) {
 ; CHECK-LABEL: @reduce_or_trunc_v8i32_i8(
-; CHECK-NEXT:    [[TR:%.*]] = trunc <8 x i32> [[A0:%.*]] to <8 x i8>
-; CHECK-NEXT:    [[RED:%.*]] = tail call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i32 [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[RED]]
 ;
   %tr = trunc <8 x i32> %a0 to <8 x i8>
@@ -48,8 +53,8 @@ declare i32 @llvm.vector.reduce.or.v8i8(<8 x i8>)

 define i8 @reduce_xor_trunc_v16i64_i8(<16 x i64> %a0) {
 ; CHECK-LABEL: @reduce_xor_trunc_v16i64_i8(
-; CHECK-NEXT:    [[TR:%.*]] = trunc <16 x i64> [[A0:%.*]] to <16 x i8>
-; CHECK-NEXT:    [[RED:%.*]] = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> [[TR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[RED]]
 ;
   %tr = trunc <16 x i64> %a0 to <16 x i8>
@@ -61,8 +66,8 @@ declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
 ; Truncation source has other uses - OK to truncate reduction
 define i16 @reduce_and_trunc_v16i64_i16(<16 x i64> %a0) {
 ; CHECK-LABEL: @reduce_and_trunc_v16i64_i16(
-; CHECK-NEXT:    [[TR:%.*]] = trunc <16 x i64> [[A0:%.*]] to <16 x i16>
-; CHECK-NEXT:    [[RED:%.*]] = tail call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> [[TR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i16
 ; CHECK-NEXT:    call void @use_v16i64(<16 x i64> [[A0]])
 ; CHECK-NEXT:    ret i16 [[RED]]
 ;
@@ -116,6 +121,3 @@ define i16 @reduce_and_trunc_v16i64_i16_multiuse(<16 x i64> %a0) {
 declare void @use_v16i64(<16 x i64>)
 declare void @use_v16i16(<16 x i16>)

-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX512: {{.*}}
-; X64: {{.*}}
