
Commit f5d153e

[VectorCombine] Fold binary op of reductions. (#121567)

Replace a binary op of two reductions with one reduction of the binary op applied to the vectors. For example:

```
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
%v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
%res = add i32 %v0_red, %v1_red
```

gets transformed to:

```
%1 = add <16 x i32> %v0, %v1
%res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
```

1 parent 8d5c1e6 commit f5d153e
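The same rewrite applies to `sub`, which has no reduction intrinsic of its own: a difference of two add-reductions equals the add-reduction of the element-wise difference, so the fold routes `sub` through `llvm.vector.reduce.add` (see `foldBinopOfReductions` below). An illustrative sketch, following the pattern of the example above:

```
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
%v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
%res = sub i32 %v0_red, %v1_red
```

becomes:

```
%1 = sub <16 x i32> %v0, %v1
%res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
```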

File tree

5 files changed (+249, -25 lines)

llvm/include/llvm/Transforms/Utils/LoopUtils.h

Lines changed: 2 additions & 0 deletions

```
@@ -365,6 +365,8 @@ constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);
 
 /// Returns the arithmetic instruction opcode used when expanding a reduction.
 unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);
+/// Returns the reduction intrinsic id corresponding to the binary operation.
+Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc);
 
 /// Returns the min/max intrinsic used when expanding a min/max reduction.
 Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID);
```
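For context, the new helper pairs each of the five handled integer binops with the reduction intrinsic whose expansion uses that opcode. Illustrative only, over a hypothetical `<4 x i32> %v`:

```
%s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)   ; Instruction::Add
%p = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %v)   ; Instruction::Mul
%a = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v)   ; Instruction::And
%o = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)    ; Instruction::Or
%x = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v)   ; Instruction::Xor
```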

llvm/lib/Transforms/Utils/LoopUtils.cpp

Lines changed: 20 additions & 0 deletions

```
@@ -957,6 +957,7 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
   }
 }
 
+// This is the inverse to getReductionForBinop
 unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   switch (RdxID) {
   case Intrinsic::vector_reduce_fadd:
@@ -986,6 +987,25 @@ unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   }
 }
 
+// This is the inverse to getArithmeticReductionInstruction
+Intrinsic::ID llvm::getReductionForBinop(Instruction::BinaryOps Opc) {
+  switch (Opc) {
+  default:
+    break;
+  case Instruction::Add:
+    return Intrinsic::vector_reduce_add;
+  case Instruction::Mul:
+    return Intrinsic::vector_reduce_mul;
+  case Instruction::And:
+    return Intrinsic::vector_reduce_and;
+  case Instruction::Or:
+    return Intrinsic::vector_reduce_or;
+  case Instruction::Xor:
+    return Intrinsic::vector_reduce_xor;
+  }
+  return Intrinsic::not_intrinsic;
+}
+
 Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID) {
   switch (RdxID) {
   default:
```

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 117 additions & 0 deletions

```
@@ -114,6 +114,7 @@ class VectorCombine {
   bool scalarizeBinopOrCmp(Instruction &I);
   bool scalarizeVPIntrinsic(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
+  bool foldBinopOfReductions(Instruction &I);
   bool foldSingleElementStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
   bool foldConcatOfBoolMasks(Instruction &I);
@@ -1242,6 +1243,121 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   return true;
 }
 
+static void analyzeCostOfVecReduction(const IntrinsicInst &II,
+                                      TTI::TargetCostKind CostKind,
+                                      const TargetTransformInfo &TTI,
+                                      InstructionCost &CostBeforeReduction,
+                                      InstructionCost &CostAfterReduction) {
+  Instruction *Op0, *Op1;
+  auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
+  auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
+  unsigned ReductionOpc =
+      getArithmeticReductionInstruction(II.getIntrinsicID());
+  if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
+    bool IsUnsigned = isa<ZExtInst>(RedOp);
+    auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
+
+    CostBeforeReduction =
+        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
+                             TTI::CastContextHint::None, CostKind, RedOp);
+    CostAfterReduction =
+        TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
+                                     ExtType, FastMathFlags(), CostKind);
+    return;
+  }
+  if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
+      match(RedOp,
+            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
+      match(Op0, m_ZExtOrSExt(m_Value())) &&
+      Op0->getOpcode() == Op1->getOpcode() &&
+      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
+    // Matched reduce.add(ext(mul(ext(A), ext(B)))
+    bool IsUnsigned = isa<ZExtInst>(Op0);
+    auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
+    VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
+
+    InstructionCost ExtCost =
+        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
+                             TTI::CastContextHint::None, CostKind, Op0);
+    InstructionCost MulCost =
+        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
+    InstructionCost Ext2Cost =
+        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
+                             TTI::CastContextHint::None, CostKind, RedOp);
+
+    CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
+    CostAfterReduction =
+        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+    return;
+  }
+  CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
+                                                      std::nullopt, CostKind);
+  return;
+}
+
+bool VectorCombine::foldBinopOfReductions(Instruction &I) {
+  Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
+  Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
+  if (BinOpOpc == Instruction::Sub)
+    ReductionIID = Intrinsic::vector_reduce_add;
+  if (ReductionIID == Intrinsic::not_intrinsic)
+    return false;
+
+  auto checkIntrinsicAndGetItsArgument = [](Value *V,
+                                            Intrinsic::ID IID) -> Value * {
+    auto *II = dyn_cast<IntrinsicInst>(V);
+    if (!II)
+      return nullptr;
+    if (II->getIntrinsicID() == IID && II->hasOneUse())
+      return II->getArgOperand(0);
+    return nullptr;
+  };
+
+  Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
+  if (!V0)
+    return false;
+  Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
+  if (!V1)
+    return false;
+
+  auto *VTy = cast<VectorType>(V0->getType());
+  if (V1->getType() != VTy)
+    return false;
+  const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
+  const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
+  unsigned ReductionOpc =
+      getArithmeticReductionInstruction(II0.getIntrinsicID());
+
+  InstructionCost OldCost = 0;
+  InstructionCost NewCost = 0;
+  InstructionCost CostOfRedOperand0 = 0;
+  InstructionCost CostOfRed0 = 0;
+  InstructionCost CostOfRedOperand1 = 0;
+  InstructionCost CostOfRed1 = 0;
+  analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
+  analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
+  OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
+  NewCost =
+      CostOfRedOperand0 + CostOfRedOperand1 +
+      TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
+      TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
+  if (NewCost >= OldCost || !NewCost.isValid())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
+                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+  Value *VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
+  if (auto *PDInst = dyn_cast<PossiblyDisjointInst>(&I))
+    if (auto *PDVectorBO = dyn_cast<PossiblyDisjointInst>(VectorBO))
+      PDVectorBO->setIsDisjoint(PDInst->isDisjoint());
+
+  Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
+  replaceValue(I, *Rdx);
+  return true;
+}
+
 // Check if memory loc modified between two instrs in the same BB
 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                  BasicBlock::iterator End,
@@ -3380,6 +3496,7 @@ bool VectorCombine::run() {
     if (Instruction::isBinaryOp(Opcode)) {
       MadeChange |= foldExtractExtract(I);
      MadeChange |= foldExtractedCmps(I);
+      MadeChange |= foldBinopOfReductions(I);
     }
     break;
   }
```
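Two guards in `foldBinopOfReductions` are worth calling out: each reduction must have the binary op as its sole user (`hasOneUse`), since a reduction with another user has to stay alive and merging would add work rather than save it, and the rewrite only fires when the TTI cost of the merged form is strictly lower. A minimal sketch of IR the fold leaves alone for the first reason (hypothetical values, in the spirit of the `multiple_use_of_reduction_*` tests):

```
%v0_red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
%v1_red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
store i32 %v0_red, ptr %p        ; second use of %v0_red ...
%res = add i32 %v0_red, %v1_red  ; ... so the fold bails out
```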
Lines changed: 93 additions & 0 deletions (new file)

```
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mattr=+mve.fp -passes=vector-combine -S | FileCheck %s

target triple = "thumbv8.1m.main-arm-none-eabi"

define i16 @add_of_reduce_add(<8 x i16> %v0, <8 x i16> %v1) {
; CHECK-LABEL: define i16 @add_of_reduce_add(
; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[V0]], [[V1]]
; CHECK-NEXT:    [[RES:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP1]])
; CHECK-NEXT:    ret i16 [[RES]]
;
  %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
  %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
  %res = add i16 %v0_red, %v1_red
  ret i16 %res
}

define i16 @reduce_zext_0(<8 x i8> %v0, <8 x i16> %v1) {
; CHECK-LABEL: define i16 @reduce_zext_0(
; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
; CHECK-NEXT:    [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
; CHECK-NEXT:    [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V1]])
; CHECK-NEXT:    [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT:    ret i16 [[RES]]
;
  %zext_ = zext <8 x i8> %v0 to <8 x i16>
  %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
  %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
  %res = add i16 %v0_red, %v1_red
  ret i16 %res
}

define i16 @reduce_zext_1(<8 x i16> %v0, <8 x i8> %v1) {
; CHECK-LABEL: define i16 @reduce_zext_1(
; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i8> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
; CHECK-NEXT:    [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V0]])
; CHECK-NEXT:    [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
; CHECK-NEXT:    [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
; CHECK-NEXT:    ret i16 [[RES]]
;
  %zext_ = zext <8 x i8> %v1 to <8 x i16>
  %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
  %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
  %res = add i16 %v0_red, %v1_red
  ret i16 %res
}

define i32 @mul_acc_pattern_0(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
; CHECK-LABEL: define i32 @mul_acc_pattern_0(
; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
; CHECK-NEXT:    [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
; CHECK-NEXT:    [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
; CHECK-NEXT:    [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
; CHECK-NEXT:    [[RES:%.*]] = add i32 [[RED_MUL_ACC_PATTERN]], [[RED]]
; CHECK-NEXT:    ret i32 [[RES]]
;
  %inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
  %inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
  %mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
  %zext_ = zext <8 x i16> %mul_ to <8 x i32>
  %red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
  %res = add i32 %red_mul_acc_pattern, %red
  ret i32 %res
}

define i32 @mul_acc_pattern_1(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
; CHECK-LABEL: define i32 @mul_acc_pattern_1(
; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
; CHECK-NEXT:    [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
; CHECK-NEXT:    [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
; CHECK-NEXT:    [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
; CHECK-NEXT:    [[RES:%.*]] = add i32 [[RED]], [[RED_MUL_ACC_PATTERN]]
; CHECK-NEXT:    ret i32 [[RES]]
;
  %inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
  %inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
  %mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
  %zext_ = zext <8 x i16> %mul_ to <8 x i32>
  %red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
  %res = add i32 %red, %red_mul_acc_pattern
  ret i32 %res
}
```
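Note that on this MVE target only `add_of_reduce_add` is rewritten; the `reduce_zext_*` and `mul_acc_pattern_*` CHECK lines keep both reductions, presumably because MVE's extending and multiply-accumulating reductions make the original form cheaper than the widened vector op the fold would need, which is exactly what `analyzeCostOfVecReduction` models. For instance, merging `reduce_zext_0` would require IR like the following (hypothetical output that the cost model rejects here):

```
%wide = zext <8 x i8> %v0 to <8 x i16>
%tmp = add <8 x i16> %wide, %v1
%res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp)
```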

llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll

Lines changed: 17 additions & 25 deletions

```
@@ -1,12 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
 
 define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @add_of_reduce_add(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -31,9 +30,8 @@ define i32 @sub_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @mul_of_reduce_mul(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = mul i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
@@ -45,9 +43,8 @@ define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @and_of_reduce_and(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = and i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v0)
@@ -59,9 +56,8 @@ define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @or_of_reduce_or(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = or i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -73,9 +69,8 @@ define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @xor_of_reduce_xor(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @xor_of_reduce_xor(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = xor i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v0)
@@ -161,9 +156,8 @@ define i32 @multiple_use_of_reduction_1(<16 x i32> %v0, <16 x i32> %v1, ptr %p)
 define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @do_not_preserve_overflow_flags(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = add nuw nsw i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -175,9 +169,8 @@ define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @preserve_disjoint_flags(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = or disjoint i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -189,9 +182,8 @@ define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @add_of_reduce_add_vscale(<vscale x 16 x i32> %v0, <vscale x 16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @add_of_reduce_add_vscale(
 ; CHECK-SAME: <vscale x 16 x i32> [[V0:%.*]], <vscale x 16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <vscale x 16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %v0)
```
