SiPearl
diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
Lines changed: 1 addition & 0 deletions
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
Lines changed: 239 additions & 10 deletions
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
Lines changed: 8 additions & 0 deletions
@@ -433,6 +433,7 @@ class AttributeSet {
       const;
   unsigned getVScaleRangeMin() const;
   std::optional<unsigned> getVScaleRangeMax() const;
+  std::optional<unsigned> getFixedVScale() const;
   UWTableKind getUWTableKind() const;
   AllocFnKind getAllocKind() const;
   MemoryEffects getMemoryEffects() const;
 
@@ -12,22 +12,205 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 
 using namespace llvm;
 
 namespace {
 
-bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
-  bool Changed = false;
+void updateDomTreeForScalableExpansion(DominatorTree *DT, BasicBlock *Preheader,
+                                       BasicBlock *Loop, BasicBlock *Exit) {
+  DT->addNewBlock(Loop, Preheader);
+  DT->changeImmediateDominator(Exit, Loop);
+  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+}
+
+/// Expand a reduction on a scalable vector into a loop
+/// that iterates over one element after the other.
+Value *expandScalableReduction(IRBuilderBase &Builder, IntrinsicInst *II,
+                               Value *Acc, Value *Vec,
+                               Instruction::BinaryOps BinOp,
+                               DominatorTree *DT) {
+  ScalableVectorType *VecTy = cast<ScalableVectorType>(Vec->getType());
+
+  // Split the original BB in two and create a new BB between them,
+  // which will be a loop.
+  BasicBlock *BeforeBB = II->getParent();
+  BasicBlock *AfterBB = SplitBlock(BeforeBB, II, DT);
+  BasicBlock *LoopBB = BasicBlock::Create(Builder.getContext(), "rdx.loop",
+                                          BeforeBB->getParent(), AfterBB);
+  BeforeBB->getTerminator()->setSuccessor(0, LoopBB);
+
+  // Calculate the number of elements in the vector:
+  Builder.SetInsertPoint(BeforeBB->getTerminator());
+  Value *NumElts =
+      Builder.CreateVScale(Builder.getInt64(VecTy->getMinNumElements()));
+
+  // Create two PHIs, one for the index of the current lane and one for
+  // the actual reduction.
+  Builder.SetInsertPoint(LoopBB);
+  PHINode *IV = Builder.CreatePHI(Builder.getInt64Ty(), 2, "index");
+  IV->addIncoming(Builder.getInt64(0), BeforeBB);
+  PHINode *RdxPhi = Builder.CreatePHI(VecTy->getScalarType(), 2, "rdx.phi");
+  RdxPhi->addIncoming(Acc, BeforeBB);
+
+  Value *IVInc =
+      Builder.CreateAdd(IV, Builder.getInt64(1), "index.next", true, true);
+  IV->addIncoming(IVInc, LoopBB);
+
+  // Extract the value at the current lane from the vector and perform
+  // the scalar reduction binop:
+  Value *Lane = Builder.CreateExtractElement(Vec, IV, "elm");
+  Value *Rdx = Builder.CreateBinOp(BinOp, RdxPhi, Lane, "rdx");
+  RdxPhi->addIncoming(Rdx, LoopBB);
+
+  // Exit when all lanes have been treated (assuming there will be at least
+  // one element in the vector):
+  Value *Done = Builder.CreateCmp(CmpInst::ICMP_EQ, IVInc, NumElts, "exitcond");
+  Builder.CreateCondBr(Done, AfterBB, LoopBB);
+
+  if (DT)
+    updateDomTreeForScalableExpansion(DT, BeforeBB, LoopBB, AfterBB);
+
+  return Rdx;
+}
+
+/// Expand a reduction on a scalable vector in a parallel-tree like
+/// manner, meaning halving the number of elements to treat in every
+/// iteration.
+Value *expandScalableTreeReduction(
+    IRBuilderBase &Builder, IntrinsicInst *II, std::optional<Value *> Acc,
+    Value *Vec, Instruction::BinaryOps BinOp,
+    function_ref<bool(Constant *)> IsNeutralElement, DominatorTree *DT,
+    std::optional<unsigned> FixedVScale) {
+  ScalableVectorType *VecTy = cast<ScalableVectorType>(Vec->getType());
+  ScalableVectorType *VecTyX2 = ScalableVectorType::get(
+      VecTy->getScalarType(), VecTy->getMinNumElements() * 2);
+
+  // If the VScale is fixed, do not generate a loop, and instead do
+  // something similar to llvm::getShuffleReduction(). That function
+  // cannot be used directly because it uses shuffle masks, which
+  // are not available for scalable vectors (even if vscale is fixed).
+  // The approach is effectively the same.
+  if (FixedVScale.has_value()) {
+    unsigned VF = VecTy->getMinNumElements() * FixedVScale.value();
+    assert(isPowerOf2_64(VF));
+    for (unsigned I = VF; I != 1; I >>= 1) {
+      Value *Extended = Builder.CreateInsertVector(
+          VecTyX2, PoisonValue::get(VecTyX2), Vec, Builder.getInt64(0));
+      Value *Pair = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2,
+                                            {VecTyX2}, {Extended});
+
+      Value *Vec1 = Builder.CreateExtractValue(Pair, {0});
+      Value *Vec2 = Builder.CreateExtractValue(Pair, {1});
+      Vec = Builder.CreateBinOp(BinOp, Vec1, Vec2, "rdx");
+    }
+    Value *FinalVal = Builder.CreateExtractElement(Vec, uint64_t(0));
+    if (Acc)
+      if (auto *C = dyn_cast<Constant>(*Acc); !C || !IsNeutralElement(C))
+        FinalVal = Builder.CreateBinOp(BinOp, *Acc, FinalVal, "rdx.final");
+    return FinalVal;
+  }
+
+  // Split the original BB in two and create a new BB between them,
+  // which will be a loop.
+  BasicBlock *BeforeBB = II->getParent();
+  BasicBlock *AfterBB = SplitBlock(BeforeBB, II, DT);
+  BasicBlock *LoopBB = BasicBlock::Create(Builder.getContext(), "rdx.loop",
+                                          BeforeBB->getParent(), AfterBB);
+  BeforeBB->getTerminator()->setSuccessor(0, LoopBB);
+
+  // This tree reduction only needs log2(N) iterations. log2(N) is computed
+  // with cttz, which only works if `vscale` is a power-of-two: true for every
+  // known architecture, but could a check with a fallback algorithm be added?
+  // NOTE(review): N == 1 gives 0 iterations, yet the loop body runs >= once.
+  assert(isPowerOf2_64(VecTy->getMinNumElements()));
+  Builder.SetInsertPoint(BeforeBB->getTerminator());
+  Value *NumElts =
+      Builder.CreateVScale(Builder.getInt64(VecTy->getMinNumElements()));
+  Value *NumIters = Builder.CreateIntrinsic(NumElts->getType(), Intrinsic::cttz,
+                                            {NumElts, Builder.getTrue()});
+
+  // Create two PHIs, one for the IV and one for the actual reduction.
+  Builder.SetInsertPoint(LoopBB);
+  PHINode *IV = Builder.CreatePHI(Builder.getInt64Ty(), 2, "iter");
+  IV->addIncoming(Builder.getInt64(0), BeforeBB);
+  PHINode *VecPhi = Builder.CreatePHI(VecTy, 2, "rdx.phi");
+  VecPhi->addIncoming(Vec, BeforeBB);
+
+  // Note that instead of calculating log2(N) beforehand and having the IV
+  // increment by one every iteration, we could also have an IV more similar to:
+  //   for (size_t active_lanes = N; active_lanes > 1; active_lanes /= 2) ...
+  // The IV is only used for the loop's exit condition, so how it is
+  // calculated does not matter to the tree reduction.
+  Value *IVInc =
+      Builder.CreateAdd(IV, Builder.getInt64(1), "iter.next", true, true);
+  IV->addIncoming(IVInc, LoopBB);
+
+  // The deinterleave intrinsic takes a vector of, for example, type
+  // <vscale x 8 x float> and produces a pair of vectors with half the size,
+  // so 2 x <vscale x 4 x float>. An insert vector operation is used to
+  // create a double-sized vector where the upper half is poison, because
+  // we never care about that upper half anyways!
+  Value *Extended = Builder.CreateInsertVector(
+      VecTyX2, PoisonValue::get(VecTyX2), VecPhi, Builder.getInt64(0));
+  Value *Pair = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2,
+                                        {VecTyX2}, {Extended});
+
+  // Take the two vectors and combine them with the reduction binop. Note
+  // that in the first iteration, the results of 1/2 of the lanes are used, in
+  // the second one 1/4, in the third one 1/8, etc. It could be nice to create
+  // a mask for this? However, on SVE at least, the instr. latency does not
+  // depend on the number of active lanes (as far as I know), so this might
+  // just be useless.
+  Value *Vec1 = Builder.CreateExtractValue(Pair, {0});
+  Value *Vec2 = Builder.CreateExtractValue(Pair, {1});
+  Value *Rdx = Builder.CreateBinOp(BinOp, Vec1, Vec2, "rdx");
+  VecPhi->addIncoming(Rdx, LoopBB);
+
+  // Reduction-loop exit condition:
+  Value *Done =
+      Builder.CreateCmp(CmpInst::ICMP_EQ, IVInc, NumIters, "exitcond");
+  Builder.CreateCondBr(Done, AfterBB, LoopBB);
+  Builder.SetInsertPoint(AfterBB, AfterBB->getFirstInsertionPt());
+  Value *FinalVal = Builder.CreateExtractElement(Rdx, uint64_t(0));
+
+  // If the Acc value is not the neutral element of the reduction operation,
+  // then we need to do the binop one last time with the end result of the
+  // tree reduction. Sidenote: LLVM's loop-vectorizer will actually generate
+  // code where Acc is zero for addition and one for multiplication most of
+  // the time.
+  if (Acc)
+    if (auto *C = dyn_cast<Constant>(*Acc); !C || !IsNeutralElement(C))
+      FinalVal = Builder.CreateBinOp(BinOp, *Acc, FinalVal, "rdx.final");
+
+  if (DT)
+    updateDomTreeForScalableExpansion(DT, BeforeBB, LoopBB, AfterBB);
+
+  return FinalVal;
+}
+
+std::pair<bool, bool> expandReductions(Function &F,
+                                       const TargetTransformInfo *TTI,
+                                       DominatorTree *DT) {
+  bool Changed = false, CFGChanged = false;
   SmallVector<IntrinsicInst *, 4> Worklist;
   for (auto &I : instructions(F)) {
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
@@ -54,6 +237,9 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     }
   }
 
+  std::optional<unsigned> FixedVScale =
+      F.getAttributes().getFnAttrs().getFixedVScale();
+
   for (auto *II : Worklist) {
     FastMathFlags FMF =
         isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
@@ -74,7 +260,31 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
       // and it can't be handled by generating a shuffle sequence.
       Value *Acc = II->getArgOperand(0);
       Value *Vec = II->getArgOperand(1);
-      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
+      auto RdxOpcode =
+          Instruction::BinaryOps(getArithmeticReductionInstruction(ID));
+
+      bool ScalableTy = Vec->getType()->isScalableTy();
+      if (ScalableTy && (!FixedVScale || FMF.allowReassoc())) {
+        CFGChanged |= !FixedVScale;
+        if (FMF.allowReassoc())
+          Rdx = expandScalableTreeReduction(
+              Builder, II, Acc, Vec, RdxOpcode,
+              [&](Constant *C) {
+                switch (ID) {
+                case Intrinsic::vector_reduce_fadd:
+                  return C->isZeroValue();
+                case Intrinsic::vector_reduce_fmul:
+                  return C->isOneValue();
+                default:
+                  llvm_unreachable("Binop not handled");
+                }
+              },
+              DT, FixedVScale);
+        else
+          Rdx = expandScalableReduction(Builder, II, Acc, Vec, RdxOpcode, DT);
+        break;
+      }
+
       if (!FMF.allowReassoc())
         Rdx = getOrderedReduction(Builder, Acc, Vec, RdxOpcode, RK);
       else {
@@ -125,10 +335,22 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     case Intrinsic::vector_reduce_umax:
     case Intrinsic::vector_reduce_umin: {
       Value *Vec = II->getArgOperand(0);
+      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
+      if (Vec->getType()->isScalableTy()) {
+        CFGChanged |= !FixedVScale;
+        Rdx = expandScalableTreeReduction(
+            Builder, II, std::nullopt, Vec, Instruction::BinaryOps(RdxOpcode),
+            [](Constant *C) -> bool {
+              llvm_unreachable(
+                  "No accumulator, so this should never be called!");
+            },
+            DT, FixedVScale);
+        break;
+      }
+
       if (!isPowerOf2_32(
               cast<FixedVectorType>(Vec->getType())->getNumElements()))
         continue;
-      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
       Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
       break;
     }
@@ -150,7 +372,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     II->eraseFromParent();
     Changed = true;
   }
-  return Changed;
+  return {CFGChanged, Changed};
 }
 
 class ExpandReductions : public FunctionPass {
@@ -161,13 +383,15 @@ class ExpandReductions : public FunctionPass {
   }
 
   bool runOnFunction(Function &F) override {
-    const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    return expandReductions(F, TTI);
+    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto *DTA = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    return expandReductions(F, TTI, DTA ? &DTA->getDomTree() : nullptr).second;
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfoWrapperPass>();
-    AU.setPreservesCFG();
+    AU.addUsedIfAvailable<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
   }
 };
 }
@@ -186,9 +410,14 @@ FunctionPass *llvm::createExpandReductionsPass() {
 PreservedAnalyses ExpandReductionsPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
-  if (!expandReductions(F, &TTI))
+  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  auto [CFGChanged, Changed] = expandReductions(F, &TTI, DT);
+  if (!Changed)
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
+  if (!CFGChanged)
+    PA.preserveSet<CFGAnalyses>();
+  else
+    PA.preserve<DominatorTreeAnalysis>();
   return PA;
 }
@@ -1158,6 +1158,14 @@ std::optional<unsigned> AttributeSet::getVScaleRangeMax() const {
   return SetNode ? SetNode->getVScaleRangeMax() : std::nullopt;
 }
 
+std::optional<unsigned> AttributeSet::getFixedVScale() const {
+  unsigned Min = getVScaleRangeMin();
+  std::optional<unsigned> Max = getVScaleRangeMax();
+  if (Min != 0 && Max.has_value() && Max.value() == Min)
+    return Min;
+  return std::nullopt;
+}
+
 UWTableKind AttributeSet::getUWTableKind() const {
   return SetNode ? SetNode->getUWTableKind() : UWTableKind::None;
 }
Original file line number	Diff line number	Diff line change
`@@ -1158,6 +1158,14 @@ std::optional<unsigned> AttributeSet::getVScaleRangeMax() const {`
`1158`	`1158`	`return SetNode ? SetNode->getVScaleRangeMax() : std::nullopt;`
`1159`	`1159`	`}`
`1160`	`1160`
	`1161`	`+std::optional<unsigned> AttributeSet::getFixedVScale() const {`
	`1162`	`+ unsigned Min = getVScaleRangeMin();`
	`1163`	`+ std::optional<unsigned> Max = getVScaleRangeMax();`
	`1164`	`+ if (Min != 0 && Max.has_value() && Max.value() == Min)`
	`1165`	`+ return Min;`
	`1166`	`+ return std::nullopt;`
	`1167`	`+}`
	`1168`	`+`
`1161`	`1169`	`UWTableKind AttributeSet::getUWTableKind() const {`
`1162`	`1170`	`return SetNode ? SetNode->getUWTableKind() : UWTableKind::None;`
`1163`	`1171`	`}`