
Commit 241fe83

[VPlan] Introduce ComputeReductionResult VPInstruction opcode. (#70253)
This patch introduces a new ComputeReductionResult opcode that computes the final reduction result in the middle block. The code from fixReduction has been moved into ComputeReductionResult, after earlier cleanup changes modeled parts of fixReduction explicitly elsewhere as needed. The recipe may be broken down further in the future.

Note that the phi nodes that merge the reduction result from the trip-count check and the middle block, used as the resume value for the scalar remainder loop, are also generated based on ComputeReductionResult. Once we have a VPValue for the reduction result, this can also be modeled explicitly and moved out of the recipe.
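The recipe-creation side of this lives in the LoopVectorize.cpp diff, which is not rendered on this page. As a rough, hedged sketch of the new modeling (the names MiddleVPBB, PhiR, NewExitingVPV and ExitDL are assumptions for illustration, not taken from the rendered diffs):

// Sketch: model the final reduction value as a VPInstruction placed in the
// middle block, taking the reduction phi recipe and the loop-exiting value.
auto *FinalReductionResult = new VPInstruction(
    VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
MiddleVPBB->appendRecipe(FinalReductionResult);
// Live-outs (and, for now, the scalar-loop resume phis) are then based on
// FinalReductionResult instead of the raw loop-exiting value.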
1 parent 0414cf0 commit 241fe83

14 files changed, +247 -248 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 10 additions & 6 deletions
@@ -346,16 +346,20 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
-  /// Generate the IR code for the body of the vectorized loop according to the
-  /// best selected \p VF, \p UF and VPlan \p BestPlan.
+  /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
+  /// according to the best selected \p VF and \p UF.
+  ///
   /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
   /// vectorization re-using plans for both the main and epilogue vector loops.
   /// It should be removed once the re-use issue has been fixed.
   /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
-  /// to re-use expansion results generated during main plan execution. Returns
-  /// a mapping of SCEVs to their expanded IR values. Note that this is a
-  /// temporary workaround needed due to the current epilogue handling.
-  DenseMap<const SCEV *, Value *>
+  /// to re-use expansion results generated during main plan execution.
+  ///
+  /// Returns a mapping of SCEVs to their expanded IR values and a mapping for
+  /// the reduction resume values. Note that this is a temporary workaround
+  /// needed due to the current epilogue handling.
+  std::pair<DenseMap<const SCEV *, Value *>,
+            DenseMap<const RecurrenceDescriptor *, Value *>>
   executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
               InnerLoopVectorizer &LB, DominatorTree *DT,
               bool IsEpilogueVectorization,
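Since executePlan now returns a pair, a caller driving epilogue vectorization might unpack it as below. This is a hypothetical sketch; the variable names (LVP, BestMainPlan, MainILV) are assumptions, not code from this commit.

// Unpack both maps with a structured binding; the second map carries the
// reduction resume values, keyed by their recurrence descriptors, for use
// when setting up the epilogue vector loop.
auto [ExpandedSCEVs, ReductionResumeValues] =
    LVP.executePlan(VF, UF, BestMainPlan, MainILV, DT,
                    /*IsEpilogueVectorization=*/true);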

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 128 additions & 223 deletions
Large diffs are not rendered by default.
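The commit message notes that the phi nodes merging the reduction result, used as the resume value for the scalar remainder loop, are now generated based on ComputeReductionResult. That code is part of the unrendered diff above, so the following is only a hedged sketch of the idea; RedResult, State, LoopScalarPreHeader, LoopMiddleBlock and OrigPhi are assumed names.

// Fetch the scalar result emitted by the ComputeReductionResult recipe and
// build the resume phi that merges it with the reduction's start value.
auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
Value *StartV = PhiR->getRecurrenceDescriptor().getRecurrenceStartValue();
Value *FinalValue =
    State.get(RedResult, VPIteration(0, VPLane::getFirstLane()));
PHINode *BCBlockPhi =
    PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
                    LoopScalarPreHeader->getTerminator());
for (BasicBlock *Incoming : predecessors(LoopScalarPreHeader))
  BCBlockPhi->addIncoming(Incoming == LoopMiddleBlock ? FinalValue : StartV,
                          Incoming);
// The scalar loop's header phi then resumes from the merge phi.
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCBlockPhi);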

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 1 addition & 0 deletions
@@ -446,6 +446,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
     // ExitBB can be re-used for the exit block of the Plan.
     NewBB = State->CFG.ExitBB;
     State->CFG.PrevBB = NewBB;
+    State->Builder.SetInsertPoint(NewBB->getFirstNonPHI());
 
     // Update the branch instruction in the predecessor to branch to ExitBB.
     VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 1 deletion
@@ -1061,7 +1061,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     // Increment the canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
     BranchOnCount,
-    BranchOnCond
+    BranchOnCond,
+    ComputeReductionResult,
   };
 
 private:
@@ -3132,6 +3133,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return Rep->isUniform();
   if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
     return all_of(GEP->operands(), isUniformAfterVectorization);
+  if (auto *VPI = dyn_cast<VPInstruction>(Def))
+    return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
   return false;
 }
 } // end namespace vputils

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 82 additions & 0 deletions
@@ -28,6 +28,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <cassert>
 
@@ -401,6 +402,84 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::ComputeReductionResult: {
+    if (Part != 0)
+      return State.get(this, 0);
+
+    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
+    // and will be removed by breaking up the recipe further.
+    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
+    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+    // Get its reduction variable descriptor.
+    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+    RecurKind RK = RdxDesc.getRecurrenceKind();
+
+    State.setDebugLocFrom(getDebugLoc());
+
+    VPValue *LoopExitingDef = getOperand(1);
+    Type *PhiTy = OrigPhi->getType();
+    VectorParts RdxParts(State.UF);
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      RdxParts[Part] = State.get(LoopExitingDef, Part);
+
+    // If the vector reduction can be performed in a smaller type, we truncate
+    // then extend the loop exit value to enable InstCombine to evaluate the
+    // entire expression in the smaller type.
+    // TODO: Handle this in truncateToMinBW.
+    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
+      for (unsigned Part = 0; Part < State.UF; ++Part)
+        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+    }
+    // Reduce all of the unrolled parts into a single vector.
+    Value *ReducedPartRdx = RdxParts[0];
+    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
+
+    if (PhiR->isOrdered()) {
+      ReducedPartRdx = RdxParts[State.UF - 1];
+    } else {
+      // Floating-point operations should have some FMF to enable the reduction.
+      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+      for (unsigned Part = 1; Part < State.UF; ++Part) {
+        Value *RdxPart = RdxParts[Part];
+        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+          ReducedPartRdx = Builder.CreateBinOp(
+              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+          TrackingVH<Value> ReductionStartValue =
+              RdxDesc.getRecurrenceStartValue();
+          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+                                         ReducedPartRdx, RdxPart);
+        } else
+          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+      }
+    }
+
+    // Create the reduction after the loop. Note that inloop reductions create
+    // the target reduction in the loop using a Reduction recipe.
+    if (State.VF.isVector() && !PhiR->isInLoop()) {
+      ReducedPartRdx =
+          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
+      // If the reduction can be performed in a smaller type, we need to extend
+      // the reduction to the wider type before we branch to the original loop.
+      if (PhiTy != RdxDesc.getRecurrenceType())
+        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
+    }
+
+    // If there were stores of the reduction value to a uniform memory address
+    // inside the loop, create the final store here.
+    if (StoreInst *SI = RdxDesc.IntermediateStore) {
+      auto *NewSI = Builder.CreateAlignedStore(
+          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
+      propagateMetadata(NewSI, SI);
+    }
+
+    return ReducedPartRdx;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -477,6 +556,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCount:
     O << "branch-on-count";
     break;
+  case VPInstruction::ComputeReductionResult:
+    O << "compute-reduction-result";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
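For a concrete feel of what the non-ordered path above emits when the loop is unrolled, here is a standalone sketch (illustrative only, not code from this commit) of the UF=2 floating-point add case: the parts are first combined with a "bin.rdx" binop, and the single remaining vector is then reduced; the final reduce.fadd call with a -0.0 start value is the same pattern visible in the middle blocks of the tests below.

// Minimal IRBuilder sketch of the emitted pattern; the recipe itself does
// this generically via RecurrenceDescriptor, the FMF from RdxDesc and
// createTargetReduction.
static Value *emitUnorderedFAddRdx(IRBuilderBase &B, Value *Part0,
                                   Value *Part1) {
  // Combine the unrolled parts first...
  Value *BinRdx = B.CreateFAdd(Part1, Part0, "bin.rdx");
  // ...then reduce the single vector to a scalar in the middle block.
  return B.CreateFAddReduce(ConstantFP::get(B.getFloatTy(), -0.0), BinRdx);
}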

llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll

Lines changed: 1 addition & 1 deletion
@@ -612,8 +612,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP14]])
 ; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP15]])
+; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP14]])
 ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED: scalar.ph:

llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll

Lines changed: 1 addition & 1 deletion
@@ -285,8 +285,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[STRIDED2:.*]], %[[VEC_PHI2]]
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]])
 ; CHECK-UNORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]])
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr
 ; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], {{.*}}

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 1 addition & 1 deletion
@@ -821,8 +821,8 @@ define void @int_float_struct(%struct.IntFloat* nocapture readonly %p) #0 {
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP4]])
 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP4]])
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]

llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll

Lines changed: 1 addition & 1 deletion
@@ -26,8 +26,8 @@ define arm_aapcs_vfpcc i32 @minmaxval4(ptr nocapture readonly %x, ptr nocapture
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:

llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll

Lines changed: 2 additions & 2 deletions
@@ -313,8 +313,8 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK: vec.epilog.iter.check:
@@ -344,8 +344,8 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
 ; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]])
 ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP14]])
+; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]])
 ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
 ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK: vec.epilog.scalar.ph:

llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll

Lines changed: 2 additions & 1 deletion
@@ -215,9 +215,10 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %res = vp<[[SEL]]>
+; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]>
 ; CHECK-NEXT: }
 ;
 entry:

llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll

Lines changed: 1 addition & 1 deletion
@@ -837,8 +837,8 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 {
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]

llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll

Lines changed: 8 additions & 8 deletions
@@ -562,10 +562,10 @@ exit: ; preds = %for.body
 define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
 ; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
 ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
 ;
 entry:
 br label %for.body
@@ -591,10 +591,10 @@ exit:
 define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
 ; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
 ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
 ;
 entry:
 br label %for.body
@@ -621,10 +621,10 @@ exit:
 define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
 ; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
 ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
 ;
 entry:
 br label %for.body
@@ -650,10 +650,10 @@ exit:
 define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
 ; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
 ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
 ;
 entry:
 br label %for.body

llvm/test/Transforms/LoopVectorize/vplan-printing.ll

Lines changed: 5 additions & 2 deletions
@@ -137,9 +137,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %red.next.lcssa = ir<%red.next>
+; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -185,6 +186,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ;
@@ -385,9 +387,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %muladd.lcssa = ir<%muladd>
+; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]>
 ; CHECK-NEXT:}
 
 entry:
