[VectorCombine] Fold binary op of reductions. #121567
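This fold rewrites `binop(reduce(v0), reduce(v1))` into `reduce(binop(v0, v1))` when both reductions have a single use and the target cost model says the merged form is cheaper. A minimal before/after sketch in LLVM IR, taken from the `add_of_reduce_add` test included in this patch:

```llvm
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
  ; Before: two single-use reductions feeding one scalar add.
  %v0_red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
  %v1_red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
  %res = add i32 %v0_red, %v1_red
  ; After vector-combine this becomes:
  ;   %t = add <16 x i32> %v0, %v1
  ;   %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %t)
  ret i32 %res
}
```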

Merged: 2 commits, Feb 22, 2025
2 changes: 2 additions & 0 deletions llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -365,6 +365,8 @@ constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);

/// Returns the arithmetic instruction opcode used when expanding a reduction.
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);
/// Returns the reduction intrinsic id corresponding to the binary operation.
Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc);

/// Returns the min/max intrinsic used when expanding a min/max reduction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID);
20 changes: 20 additions & 0 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -957,6 +957,7 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
}
}

// This is the inverse to getReductionForBinop
unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
switch (RdxID) {
case Intrinsic::vector_reduce_fadd:
@@ -986,6 +987,25 @@ unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
}
}

// This is the inverse to getArithmeticReductionInstruction
Intrinsic::ID llvm::getReductionForBinop(Instruction::BinaryOps Opc) {
switch (Opc) {
default:
break;
case Instruction::Add:
return Intrinsic::vector_reduce_add;
case Instruction::Mul:
return Intrinsic::vector_reduce_mul;
case Instruction::And:
return Intrinsic::vector_reduce_and;
case Instruction::Or:
return Intrinsic::vector_reduce_or;
case Instruction::Xor:
return Intrinsic::vector_reduce_xor;
}
return Intrinsic::not_intrinsic;
}

Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID) {
switch (RdxID) {
default:
117 changes: 117 additions & 0 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -114,6 +114,7 @@ class VectorCombine {
bool scalarizeBinopOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
bool foldExtractedCmps(Instruction &I);
bool foldBinopOfReductions(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldConcatOfBoolMasks(Instruction &I);
@@ -1242,6 +1243,121 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
return true;
}

static void analyzeCostOfVecReduction(const IntrinsicInst &II,
TTI::TargetCostKind CostKind,
const TargetTransformInfo &TTI,
InstructionCost &CostBeforeReduction,
InstructionCost &CostAfterReduction) {
Instruction *Op0, *Op1;
auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
unsigned ReductionOpc =
getArithmeticReductionInstruction(II.getIntrinsicID());
if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
bool IsUnsigned = isa<ZExtInst>(RedOp);
auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());

CostBeforeReduction =
TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
TTI::CastContextHint::None, CostKind, RedOp);
CostAfterReduction =
TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
ExtType, FastMathFlags(), CostKind);
return;
}
if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
match(RedOp,
m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
match(Op0, m_ZExtOrSExt(m_Value())) &&
Op0->getOpcode() == Op1->getOpcode() &&
Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
(Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
// Matched reduce.add(ext(mul(ext(A), ext(B))))
bool IsUnsigned = isa<ZExtInst>(Op0);
auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);

InstructionCost ExtCost =
TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
TTI::CastContextHint::None, CostKind, Op0);
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
InstructionCost Ext2Cost =
TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
TTI::CastContextHint::None, CostKind, RedOp);

CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
CostAfterReduction =
TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
return;
}
CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
std::nullopt, CostKind);
return;
}

bool VectorCombine::foldBinopOfReductions(Instruction &I) {
Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
if (BinOpOpc == Instruction::Sub)
ReductionIID = Intrinsic::vector_reduce_add;
if (ReductionIID == Intrinsic::not_intrinsic)
return false;

auto checkIntrinsicAndGetItsArgument = [](Value *V,
Intrinsic::ID IID) -> Value * {
auto *II = dyn_cast<IntrinsicInst>(V);
if (!II)
return nullptr;
if (II->getIntrinsicID() == IID && II->hasOneUse())
return II->getArgOperand(0);
return nullptr;
};

Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
if (!V0)
return false;
Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
if (!V1)
return false;

auto *VTy = cast<VectorType>(V0->getType());
if (V1->getType() != VTy)
return false;
const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
unsigned ReductionOpc =
getArithmeticReductionInstruction(II0.getIntrinsicID());

InstructionCost OldCost = 0;
InstructionCost NewCost = 0;
InstructionCost CostOfRedOperand0 = 0;
InstructionCost CostOfRed0 = 0;
InstructionCost CostOfRedOperand1 = 0;
InstructionCost CostOfRed1 = 0;
analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
NewCost =
CostOfRedOperand0 + CostOfRedOperand1 +
TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
if (NewCost >= OldCost || !NewCost.isValid())
return false;

LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
Value *VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
if (auto *PDInst = dyn_cast<PossiblyDisjointInst>(&I))
if (auto *PDVectorBO = dyn_cast<PossiblyDisjointInst>(VectorBO))
PDVectorBO->setIsDisjoint(PDInst->isDisjoint());

Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
replaceValue(I, *Rdx);
return true;
}

// Check if memory loc modified between two instrs in the same BB
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
BasicBlock::iterator End,
@@ -3382,6 +3498,7 @@ bool VectorCombine::run() {
if (Instruction::isBinaryOp(Opcode)) {
MadeChange |= foldExtractExtract(I);
MadeChange |= foldExtractedCmps(I);
MadeChange |= foldBinopOfReductions(I);
}
break;
}
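For context on the cost check above: the fold fires only when one vector binop plus a single reduction is strictly cheaper than the two original reductions plus the scalar binop, with analyzeCostOfVecReduction pricing any extend or extend-of-mul-of-extends (multiply-accumulate) pattern feeding each reduction. A rough condensation of that comparison, with plain integers standing in for InstructionCost and hypothetical parameter names rather than the exact TTI calls:

```c++
// Sketch of the profitability test in foldBinopOfReductions (names are
// illustrative, not the actual API).
// OldCost: both original reductions plus the scalar binop combining them.
// NewCost: pre-reduction work for each operand, one vector binop, one reduction.
bool isMergeProfitable(int CostOfRed0, int CostOfRed1, int ScalarBinOpCost,
                       int CostOfRedOperand0, int CostOfRedOperand1,
                       int VectorBinOpCost, int SingleReductionCost) {
  int OldCost = CostOfRed0 + CostOfRed1 + ScalarBinOpCost;
  int NewCost = CostOfRedOperand0 + CostOfRedOperand1 + VectorBinOpCost +
                SingleReductionCost;
  return NewCost < OldCost; // mirrors the "NewCost >= OldCost" bail-out above
}
```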
93 changes: 93 additions & 0 deletions llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll
@@ -0,0 +1,93 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mattr=+mve.fp -passes=vector-combine -S | FileCheck %s

target triple = "thumbv8.1m.main-arm-none-eabi"

define i16 @add_of_reduce_add(<8 x i16> %v0, <8 x i16> %v1) {
; CHECK-LABEL: define i16 @add_of_reduce_add(
; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP1]])
; CHECK-NEXT: ret i16 [[RES]]
;
%v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
%v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
%res = add i16 %v0_red, %v1_red
ret i16 %res
}

define i16 @reduce_zext_0(<8 x i8> %v0, <8 x i16> %v1) {
; CHECK-LABEL: define i16 @reduce_zext_0(
; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
; CHECK-NEXT: [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
; CHECK-NEXT: [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: ret i16 [[RES]]
;
%zext_ = zext <8 x i8> %v0 to <8 x i16>
%v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
%v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
%res = add i16 %v0_red, %v1_red
ret i16 %res
}

define i16 @reduce_zext_1(<8 x i16> %v0, <8 x i8> %v1) {
; CHECK-LABEL: define i16 @reduce_zext_1(
; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i8> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
; CHECK-NEXT: [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
; CHECK-NEXT: [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: ret i16 [[RES]]
;
%zext_ = zext <8 x i8> %v1 to <8 x i16>
%v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
%v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
%res = add i16 %v0_red, %v1_red
ret i16 %res
}

define i32 @mul_acc_pattern_0(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
; CHECK-LABEL: define i32 @mul_acc_pattern_0(
; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
; CHECK-NEXT: [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
; CHECK-NEXT: [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
; CHECK-NEXT: [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
; CHECK-NEXT: [[RES:%.*]] = add i32 [[RED_MUL_ACC_PATTERN]], [[RED]]
; CHECK-NEXT: ret i32 [[RES]]
;
%inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
%inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
%mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
%zext_ = zext <8 x i16> %mul_ to <8 x i32>
%red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
%red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
%res = add i32 %red_mul_acc_pattern, %red
ret i32 %res
}

define i32 @mul_acc_pattern_1(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
; CHECK-LABEL: define i32 @mul_acc_pattern_1(
; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
; CHECK-NEXT: [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
; CHECK-NEXT: [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
; CHECK-NEXT: [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
; CHECK-NEXT: [[RES:%.*]] = add i32 [[RED]], [[RED_MUL_ACC_PATTERN]]
; CHECK-NEXT: ret i32 [[RES]]
;
%inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
%inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
%mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
%zext_ = zext <8 x i16> %mul_ to <8 x i32>
%red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
%red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
%res = add i32 %red, %red_mul_acc_pattern
ret i32 %res
}
42 changes: 17 additions & 25 deletions llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
@@ -1,12 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
; RUN: opt < %s -passes=vector-combine -S | FileCheck %s

define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @add_of_reduce_add(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -31,9 +30,8 @@ define i32 @sub_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @mul_of_reduce_mul(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = mul i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
@@ -45,9 +43,8 @@ define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @and_of_reduce_and(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = and i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v0)
@@ -59,9 +56,8 @@ define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @or_of_reduce_or(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = or i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = or <16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -73,9 +69,8 @@ define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @xor_of_reduce_xor(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @xor_of_reduce_xor(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = xor i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v0)
@@ -161,9 +156,8 @@ define i32 @multiple_use_of_reduction_1(<16 x i32> %v0, <16 x i32> %v1, ptr %p)
define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @do_not_preserve_overflow_flags(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -175,9 +169,8 @@ define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @preserve_disjoint_flags(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = or disjoint i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = or disjoint <16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -189,9 +182,8 @@ define i32 @add_of_reduce_add_vscale(<vscale x 16 x i32> %v0, <vscale x 16 x i32> %v1) {
define i32 @add_of_reduce_add_vscale(<vscale x 16 x i32> %v0, <vscale x 16 x i32> %v1) {
; CHECK-LABEL: define i32 @add_of_reduce_add_vscale(
; CHECK-SAME: <vscale x 16 x i32> [[V0:%.*]], <vscale x 16 x i32> [[V1:%.*]]) {
; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V0]])
; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V1]])
; CHECK-NEXT: [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT: [[TMP1:%.*]] = add <vscale x 16 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %v0)