
Commit 1dbdf29
[LoopVectorizer][AArch64] Move getMinTripCountTailFoldingThreshold later.
This moves the check of MinTripCountTailFoldingThreshold later, into the calculation of whether to tail fold. This allows the vectorizer to first determine whether tail predication is required at all, for either scalable or fixed-width vectors. The threshold is currently only overridden by AArch64, where it returns a minimum trip count of 5. This patch aims to allow the vectorization of TC=4 loops, preventing them from performing slower when SVE is present.
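
For illustration only (not part of the commit), a minimal C++ sketch of the kind of trip-count-4 loop this change targets; the function name and shape are hypothetical:

    // A TC=4 loop. With the AArch64 threshold of 5 checked up front, the
    // vectorizer previously bailed out on this loop entirely; with the check
    // moved later, it can instead notice that a fixed-width VF of 4 divides
    // the trip count exactly, so a non-predicated vector body with no scalar
    // tail is valid.
    int sum4(const int *a, const int *b) {
      int sum = 0;
      for (int i = 0; i < 4; ++i)
        sum += a[i] + b[i];
      return sum;
    }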
1 parent: 97eb4af

2 files changed: +136, -121 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 39 additions & 26 deletions
@@ -4027,11 +4027,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
   }
 
-  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
-    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
-           "MaxFixedVF must be a power of 2");
-    unsigned MaxVFtimesIC =
-        UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
+  auto IsKnownModTripCountZero = [this, &UserIC](unsigned MaxVF) {
+    unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
     ScalarEvolution *SE = PSE.getSE();
     // Currently only loops with countable exits are vectorized, but calling
     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
@@ -4045,13 +4042,40 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     const SCEV *Rem = SE->getURemExpr(
         SE->applyLoopGuards(ExitCount, TheLoop),
         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
-    if (Rem->isZero()) {
+    return Rem->isZero();
+  };
+
+  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
+           "MaxFixedVF must be a power of 2");
+    if (IsKnownModTripCountZero(*MaxPowerOf2RuntimeVF)) {
       // Accept MaxFixedVF if we do not have a tail.
       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
       return MaxFactors;
     }
   }
 
+  if (MaxTC && MaxTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+    if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+      // If we have a low-trip-count, and the fixed-width VF is known to divide
+      // the trip count but the scalable factor does not, use the fixed-width
+      // factor in preference to allow the generation of a non-predicated loop.
+      if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
+          IsKnownModTripCountZero(MaxFactors.FixedVF.getFixedValue())) {
+        LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
+                             "remain for any chosen VF.\n");
+        MaxFactors.ScalableVF = ElementCount::getScalable(0);
+        return MaxFactors;
+      }
+    }
+
+    reportVectorizationFailure(
+        "The trip count is below the minial threshold value.",
+        "loop trip count is too low, avoiding vectorization", "LowTripCount",
+        ORE, TheLoop);
+    return FixedScalableVFPair::getNone();
+  }
+
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
@@ -10595,26 +10619,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
         LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
       else {
-        if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
-          LLVM_DEBUG(dbgs() << "\n");
-          // Predicate tail-folded loops are efficient even when the loop
-          // iteration count is low. However, setting the epilogue policy to
-          // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
-          // with runtime checks. It's more effective to let
-          // `isOutsideLoopWorkProfitable` determine if vectorization is
-          // beneficial for the loop.
-          if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
-            SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
-        } else {
-          LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
-                               "small to consider vectorizing.\n");
-          reportVectorizationFailure(
-              "The trip count is below the minial threshold value.",
-              "loop trip count is too low, avoiding vectorization",
-              "LowTripCount", ORE, L);
-          Hints.emitRemarkWithHints();
-          return false;
-        }
+        LLVM_DEBUG(dbgs() << "\n");
+        // Predicate tail-folded loops are efficient even when the loop
+        // iteration count is low. However, setting the epilogue policy to
+        // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
+        // with runtime checks. It's more effective to let
+        // `isOutsideLoopWorkProfitable` determine if vectorization is
+        // beneficial for the loop.
+        if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
+          SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
       }
     }
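
As a plain-arithmetic model of what the new IsKnownModTripCountZero lambda computes (a simplified sketch, not the SCEV-based implementation above; the free function and its parameters are illustrative):

    #include <cstdint>

    // Models the question the lambda asks ScalarEvolution: does the trip
    // count divide evenly by VF * IC (or by VF alone when no interleave
    // count was requested), so that no scalar tail remains?
    bool isKnownModTripCountZero(uint64_t TripCount, unsigned MaxVF,
                                 unsigned UserIC) {
      unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
      return TripCount % MaxVFtimesIC == 0;
    }

For example, isKnownModTripCountZero(4, 4, 0) is true: a TC=4 loop with a fixed-width VF of 4 leaves no tail, which is exactly the case the new low-trip-count branch uses to prefer a fixed-width factor over a predicated scalable one.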

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll

Lines changed: 97 additions & 95 deletions
@@ -18,7 +18,7 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small'
 ; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred.
-; DEBUG: LV: Not vectorizing: The trip count is below the minial threshold value..
+; DEBUG: LV: Not vectorizing: Runtime SCEV check is required with -Os/-Oz.
 
 ; DEBUG-LABEL: LV: Checking a loop in 'too_many_runtime_checks'
 ; DEBUG: LV: Found trip count: 0
@@ -482,115 +482,117 @@ while.end:
 define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
 ; CHECK-LABEL: define i32 @tc4(
 ; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 16
 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 32
 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 48
 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 64
 ; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 80
 ; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 96
 ; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 112
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_COND_CLEANUP:.*]]:
-; CHECK-NEXT: [[ADD89_LCSSA:%.*]] = phi i32 [ [[ADD89:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: ret i32 [[ADD89_LCSSA]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_0179:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD89]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX2]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP10]], [[TMP8]]
-; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[ADD17:%.*]] = add i32 [[TMP11]], [[TMP2]]
-; CHECK-NEXT: [[SUB24:%.*]] = sub i32 [[TMP2]], [[TMP11]]
-; CHECK-NEXT: [[ADD25:%.*]] = add i32 [[ADD17]], [[ADD]]
-; CHECK-NEXT: [[SUB26:%.*]] = sub i32 [[ADD]], [[ADD17]]
-; CHECK-NEXT: [[ADD27:%.*]] = add i32 [[SUB24]], [[SUB]]
-; CHECK-NEXT: [[SUB28:%.*]] = sub i32 [[SUB]], [[SUB24]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = sub <4 x i32> [[TMP5]], [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
+; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP6]], [[TMP12]]
 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX30]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX33]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP19]], align 4
-; CHECK-NEXT: [[ADD36:%.*]] = add i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[SUB44:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP22:%.*]] = sub <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX46]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP23]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP24]], align 4
 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX49]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP25]], align 4
-; CHECK-NEXT: [[ADD52:%.*]] = add i32 [[TMP12]], [[TMP6]]
-; CHECK-NEXT: [[SUB60:%.*]] = sub i32 [[TMP6]], [[TMP12]]
-; CHECK-NEXT: [[ADD61:%.*]] = add i32 [[ADD52]], [[ADD36]]
-; CHECK-NEXT: [[SUB62:%.*]] = sub i32 [[ADD36]], [[ADD52]]
-; CHECK-NEXT: [[ADD63:%.*]] = add i32 [[SUB60]], [[SUB44]]
-; CHECK-NEXT: [[SUB64:%.*]] = sub i32 [[SUB44]], [[SUB60]]
-; CHECK-NEXT: [[ADD65:%.*]] = add i32 [[ADD61]], [[ADD25]]
-; CHECK-NEXT: [[SHR_I173:%.*]] = lshr i32 [[ADD65]], 15
-; CHECK-NEXT: [[AND_I174:%.*]] = and i32 [[SHR_I173]], 65537
-; CHECK-NEXT: [[MUL_I175:%.*]] = mul nuw i32 [[AND_I174]], 65535
-; CHECK-NEXT: [[ADD_I176:%.*]] = add i32 [[MUL_I175]], [[ADD65]]
-; CHECK-NEXT: [[XOR_I177:%.*]] = xor i32 [[ADD_I176]], [[MUL_I175]]
-; CHECK-NEXT: [[SUB66:%.*]] = sub i32 [[ADD25]], [[ADD61]]
-; CHECK-NEXT: [[SHR_I168:%.*]] = lshr i32 [[SUB66]], 15
-; CHECK-NEXT: [[AND_I169:%.*]] = and i32 [[SHR_I168]], 65537
-; CHECK-NEXT: [[MUL_I170:%.*]] = mul nuw i32 [[AND_I169]], 65535
-; CHECK-NEXT: [[ADD_I171:%.*]] = add i32 [[MUL_I170]], [[SUB66]]
-; CHECK-NEXT: [[XOR_I172:%.*]] = xor i32 [[ADD_I171]], [[MUL_I170]]
-; CHECK-NEXT: [[ADD69:%.*]] = add i32 [[ADD63]], [[ADD27]]
-; CHECK-NEXT: [[SHR_I163:%.*]] = lshr i32 [[ADD69]], 15
-; CHECK-NEXT: [[AND_I164:%.*]] = and i32 [[SHR_I163]], 65537
-; CHECK-NEXT: [[MUL_I165:%.*]] = mul nuw i32 [[AND_I164]], 65535
-; CHECK-NEXT: [[ADD_I166:%.*]] = add i32 [[MUL_I165]], [[ADD69]]
-; CHECK-NEXT: [[XOR_I167:%.*]] = xor i32 [[ADD_I166]], [[MUL_I165]]
-; CHECK-NEXT: [[SUB71:%.*]] = sub i32 [[ADD27]], [[ADD63]]
-; CHECK-NEXT: [[SHR_I158:%.*]] = lshr i32 [[SUB71]], 15
-; CHECK-NEXT: [[AND_I159:%.*]] = and i32 [[SHR_I158]], 65537
-; CHECK-NEXT: [[MUL_I160:%.*]] = mul nuw i32 [[AND_I159]], 65535
-; CHECK-NEXT: [[ADD_I161:%.*]] = add i32 [[MUL_I160]], [[SUB71]]
-; CHECK-NEXT: [[XOR_I162:%.*]] = xor i32 [[ADD_I161]], [[MUL_I160]]
-; CHECK-NEXT: [[ADD75:%.*]] = add i32 [[SUB62]], [[SUB26]]
-; CHECK-NEXT: [[SHR_I153:%.*]] = lshr i32 [[ADD75]], 15
-; CHECK-NEXT: [[AND_I154:%.*]] = and i32 [[SHR_I153]], 65537
-; CHECK-NEXT: [[MUL_I155:%.*]] = mul nuw i32 [[AND_I154]], 65535
-; CHECK-NEXT: [[ADD_I156:%.*]] = add i32 [[MUL_I155]], [[ADD75]]
-; CHECK-NEXT: [[XOR_I157:%.*]] = xor i32 [[ADD_I156]], [[MUL_I155]]
-; CHECK-NEXT: [[SUB77:%.*]] = sub i32 [[SUB26]], [[SUB62]]
-; CHECK-NEXT: [[SHR_I148:%.*]] = lshr i32 [[SUB77]], 15
-; CHECK-NEXT: [[AND_I149:%.*]] = and i32 [[SHR_I148]], 65537
-; CHECK-NEXT: [[MUL_I150:%.*]] = mul nuw i32 [[AND_I149]], 65535
-; CHECK-NEXT: [[ADD_I151:%.*]] = add i32 [[MUL_I150]], [[SUB77]]
-; CHECK-NEXT: [[XOR_I152:%.*]] = xor i32 [[ADD_I151]], [[MUL_I150]]
-; CHECK-NEXT: [[ADD81:%.*]] = add i32 [[SUB64]], [[SUB28]]
-; CHECK-NEXT: [[SHR_I143:%.*]] = lshr i32 [[ADD81]], 15
-; CHECK-NEXT: [[AND_I144:%.*]] = and i32 [[SHR_I143]], 65537
-; CHECK-NEXT: [[MUL_I145:%.*]] = mul nuw i32 [[AND_I144]], 65535
-; CHECK-NEXT: [[ADD_I146:%.*]] = add i32 [[MUL_I145]], [[ADD81]]
-; CHECK-NEXT: [[XOR_I147:%.*]] = xor i32 [[ADD_I146]], [[MUL_I145]]
-; CHECK-NEXT: [[SUB83:%.*]] = sub i32 [[SUB28]], [[SUB64]]
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[SUB83]], 15
-; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
-; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535
-; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[SUB83]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]]
-; CHECK-NEXT: [[ADD73:%.*]] = add i32 [[XOR_I147]], [[XOR_I]]
-; CHECK-NEXT: [[ADD68:%.*]] = add i32 [[ADD73]], [[XOR_I152]]
-; CHECK-NEXT: [[ADD74:%.*]] = add i32 [[ADD68]], [[XOR_I157]]
-; CHECK-NEXT: [[ADD79:%.*]] = add i32 [[ADD74]], [[XOR_I172]]
-; CHECK-NEXT: [[ADD80:%.*]] = add i32 [[ADD79]], [[XOR_I177]]
-; CHECK-NEXT: [[ADD85:%.*]] = add i32 [[ADD80]], [[XOR_I162]]
-; CHECK-NEXT: [[ADD86:%.*]] = add i32 [[ADD85]], [[XOR_I167]]
-; CHECK-NEXT: [[CONV87:%.*]] = and i32 [[ADD86]], 65535
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD86]], 16
-; CHECK-NEXT: [[ADD88:%.*]] = add i32 [[SHR]], [[SUM_0179]]
-; CHECK-NEXT: [[ADD89]] = add i32 [[ADD88]], [[CONV87]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP26]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP28:%.*]] = sub <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD7]]
+; CHECK-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP27]], [[TMP21]]
+; CHECK-NEXT: [[TMP30:%.*]] = sub <4 x i32> [[TMP21]], [[TMP27]]
+; CHECK-NEXT: [[TMP31:%.*]] = add <4 x i32> [[TMP28]], [[TMP22]]
+; CHECK-NEXT: [[TMP32:%.*]] = sub <4 x i32> [[TMP22]], [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP29]], [[TMP13]]
+; CHECK-NEXT: [[TMP34:%.*]] = lshr <4 x i32> [[TMP33]], splat (i32 15)
+; CHECK-NEXT: [[TMP35:%.*]] = and <4 x i32> [[TMP34]], splat (i32 65537)
+; CHECK-NEXT: [[TMP36:%.*]] = mul nuw <4 x i32> [[TMP35]], splat (i32 65535)
+; CHECK-NEXT: [[TMP37:%.*]] = add <4 x i32> [[TMP36]], [[TMP33]]
+; CHECK-NEXT: [[TMP38:%.*]] = xor <4 x i32> [[TMP37]], [[TMP36]]
+; CHECK-NEXT: [[TMP39:%.*]] = sub <4 x i32> [[TMP13]], [[TMP29]]
+; CHECK-NEXT: [[TMP40:%.*]] = lshr <4 x i32> [[TMP39]], splat (i32 15)
+; CHECK-NEXT: [[TMP41:%.*]] = and <4 x i32> [[TMP40]], splat (i32 65537)
+; CHECK-NEXT: [[TMP42:%.*]] = mul nuw <4 x i32> [[TMP41]], splat (i32 65535)
+; CHECK-NEXT: [[TMP43:%.*]] = add <4 x i32> [[TMP42]], [[TMP39]]
+; CHECK-NEXT: [[TMP44:%.*]] = xor <4 x i32> [[TMP43]], [[TMP42]]
+; CHECK-NEXT: [[TMP45:%.*]] = add <4 x i32> [[TMP31]], [[TMP15]]
+; CHECK-NEXT: [[TMP46:%.*]] = lshr <4 x i32> [[TMP45]], splat (i32 15)
+; CHECK-NEXT: [[TMP47:%.*]] = and <4 x i32> [[TMP46]], splat (i32 65537)
+; CHECK-NEXT: [[TMP48:%.*]] = mul nuw <4 x i32> [[TMP47]], splat (i32 65535)
+; CHECK-NEXT: [[TMP49:%.*]] = add <4 x i32> [[TMP48]], [[TMP45]]
+; CHECK-NEXT: [[TMP50:%.*]] = xor <4 x i32> [[TMP49]], [[TMP48]]
+; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP15]], [[TMP31]]
+; CHECK-NEXT: [[TMP52:%.*]] = lshr <4 x i32> [[TMP51]], splat (i32 15)
+; CHECK-NEXT: [[TMP53:%.*]] = and <4 x i32> [[TMP52]], splat (i32 65537)
+; CHECK-NEXT: [[TMP54:%.*]] = mul nuw <4 x i32> [[TMP53]], splat (i32 65535)
+; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP51]]
+; CHECK-NEXT: [[TMP56:%.*]] = xor <4 x i32> [[TMP55]], [[TMP54]]
+; CHECK-NEXT: [[TMP57:%.*]] = add <4 x i32> [[TMP30]], [[TMP14]]
+; CHECK-NEXT: [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], splat (i32 15)
+; CHECK-NEXT: [[TMP59:%.*]] = and <4 x i32> [[TMP58]], splat (i32 65537)
+; CHECK-NEXT: [[TMP60:%.*]] = mul nuw <4 x i32> [[TMP59]], splat (i32 65535)
+; CHECK-NEXT: [[TMP61:%.*]] = add <4 x i32> [[TMP60]], [[TMP57]]
+; CHECK-NEXT: [[TMP62:%.*]] = xor <4 x i32> [[TMP61]], [[TMP60]]
+; CHECK-NEXT: [[TMP63:%.*]] = sub <4 x i32> [[TMP14]], [[TMP30]]
+; CHECK-NEXT: [[TMP64:%.*]] = lshr <4 x i32> [[TMP63]], splat (i32 15)
+; CHECK-NEXT: [[TMP65:%.*]] = and <4 x i32> [[TMP64]], splat (i32 65537)
+; CHECK-NEXT: [[TMP66:%.*]] = mul nuw <4 x i32> [[TMP65]], splat (i32 65535)
+; CHECK-NEXT: [[TMP67:%.*]] = add <4 x i32> [[TMP66]], [[TMP63]]
+; CHECK-NEXT: [[TMP68:%.*]] = xor <4 x i32> [[TMP67]], [[TMP66]]
+; CHECK-NEXT: [[TMP69:%.*]] = add <4 x i32> [[TMP32]], [[TMP16]]
+; CHECK-NEXT: [[TMP70:%.*]] = lshr <4 x i32> [[TMP69]], splat (i32 15)
+; CHECK-NEXT: [[TMP71:%.*]] = and <4 x i32> [[TMP70]], splat (i32 65537)
+; CHECK-NEXT: [[TMP72:%.*]] = mul nuw <4 x i32> [[TMP71]], splat (i32 65535)
+; CHECK-NEXT: [[TMP73:%.*]] = add <4 x i32> [[TMP72]], [[TMP69]]
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i32> [[TMP73]], [[TMP72]]
+; CHECK-NEXT: [[TMP75:%.*]] = sub <4 x i32> [[TMP16]], [[TMP32]]
+; CHECK-NEXT: [[TMP76:%.*]] = lshr <4 x i32> [[TMP75]], splat (i32 15)
+; CHECK-NEXT: [[TMP77:%.*]] = and <4 x i32> [[TMP76]], splat (i32 65537)
+; CHECK-NEXT: [[TMP78:%.*]] = mul nuw <4 x i32> [[TMP77]], splat (i32 65535)
+; CHECK-NEXT: [[TMP79:%.*]] = add <4 x i32> [[TMP78]], [[TMP75]]
+; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i32> [[TMP79]], [[TMP78]]
+; CHECK-NEXT: [[TMP81:%.*]] = add <4 x i32> [[TMP74]], [[TMP80]]
+; CHECK-NEXT: [[TMP82:%.*]] = add <4 x i32> [[TMP81]], [[TMP68]]
+; CHECK-NEXT: [[TMP83:%.*]] = add <4 x i32> [[TMP82]], [[TMP62]]
+; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP83]], [[TMP44]]
+; CHECK-NEXT: [[TMP85:%.*]] = add <4 x i32> [[TMP84]], [[TMP38]]
+; CHECK-NEXT: [[TMP86:%.*]] = add <4 x i32> [[TMP85]], [[TMP56]]
+; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP50]]
+; CHECK-NEXT: [[TMP88:%.*]] = and <4 x i32> [[TMP87]], splat (i32 65535)
+; CHECK-NEXT: [[TMP89:%.*]] = lshr <4 x i32> [[TMP87]], splat (i32 16)
+; CHECK-NEXT: [[TMP90:%.*]] = add <4 x i32> [[TMP89]], zeroinitializer
+; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP90]], [[TMP88]]
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 4
+; CHECK-NEXT: [[TMP92:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP91]])
+; CHECK-NEXT: ret i32 [[TMP92]]
 ;
 entry:
   %arrayidx2 = getelementptr inbounds nuw i8, ptr %tmp, i64 16
