Skip to content

Commit c3f5000

Browse files
committed
Address comments
1 parent 1dbdf29 commit c3f5000

File tree

2 files changed

+58
-213
lines changed

2 files changed

+58
-213
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -4027,7 +4027,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40274027
MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
40284028
}
40294029

4030-
auto IsKnownModTripCountZero = [this, &UserIC](unsigned MaxVF) {
4030+
auto ScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
40314031
unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
40324032
ScalarEvolution *SE = PSE.getSE();
40334033
// Currently only loops with countable exits are vectorized, but calling
@@ -4045,23 +4045,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40454045
return Rem->isZero();
40464046
};
40474047

4048-
if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4048+
if (MaxPowerOf2RuntimeVF > 0) {
40494049
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
40504050
"MaxFixedVF must be a power of 2");
4051-
if (IsKnownModTripCountZero(*MaxPowerOf2RuntimeVF)) {
4051+
if (ScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
40524052
// Accept MaxFixedVF if we do not have a tail.
40534053
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
40544054
return MaxFactors;
40554055
}
40564056
}
40574057

4058-
if (MaxTC && MaxTC <= TTI.getMinTripCountTailFoldingThreshold()) {
4059-
if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4058+
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
4059+
if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
4060+
if (MaxPowerOf2RuntimeVF > 0) {
40604061
// If we have a low-trip-count, and the fixed-width VF is known to divide
40614062
// the trip count but the scalable factor does not, use the fixed-width
40624063
// factor in preference to allow the generation of a non-predicated loop.
40634064
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
4064-
IsKnownModTripCountZero(MaxFactors.FixedVF.getFixedValue())) {
4065+
ScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
40654066
LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
40664067
"remain for any chosen VF.\n");
40674068
MaxFactors.ScalableVF = ElementCount::getScalable(0);

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll

Lines changed: 51 additions & 207 deletions
Original file line number | Diff line number | Diff line change
@@ -483,233 +483,75 @@ define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
483483
; CHECK-LABEL: define i32 @tc4(
484484
; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
485485
; CHECK-NEXT: [[ENTRY:.*:]]
486-
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 16
487-
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 32
488-
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 48
489-
; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 64
490-
; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 80
491-
; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 96
492-
; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 112
493-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
494-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[TMP0]]
495-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
486+
; CHECK-NEXT: [[INDVARS_IV:%.*]] = add i64 0, 0
487+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
488+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX1]], i32 0
496489
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
497-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX2]], i64 0, i64 [[TMP0]]
498-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
499-
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
500-
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
501-
; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
502-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[TMP0]]
503-
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0
504-
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
505-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[TMP0]]
506-
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP9]], i32 0
507-
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
508-
; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
509-
; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD3]]
510-
; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP11]], [[TMP5]]
511-
; CHECK-NEXT: [[TMP14:%.*]] = sub <4 x i32> [[TMP5]], [[TMP11]]
512-
; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
513-
; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP6]], [[TMP12]]
514-
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX30]], i64 0, i64 [[TMP0]]
515-
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
516-
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4
517-
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX33]], i64 0, i64 [[TMP0]]
518-
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP19]], i32 0
519-
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
520-
; CHECK-NEXT: [[TMP21:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD4]]
521-
; CHECK-NEXT: [[TMP22:%.*]] = sub <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
522-
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX46]], i64 0, i64 [[TMP0]]
523-
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0
524-
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP24]], align 4
525-
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX49]], i64 0, i64 [[TMP0]]
526-
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0
527-
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP26]], align 4
528-
; CHECK-NEXT: [[TMP27:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
529-
; CHECK-NEXT: [[TMP28:%.*]] = sub <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD7]]
530-
; CHECK-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP27]], [[TMP21]]
531-
; CHECK-NEXT: [[TMP30:%.*]] = sub <4 x i32> [[TMP21]], [[TMP27]]
532-
; CHECK-NEXT: [[TMP31:%.*]] = add <4 x i32> [[TMP28]], [[TMP22]]
533-
; CHECK-NEXT: [[TMP32:%.*]] = sub <4 x i32> [[TMP22]], [[TMP28]]
534-
; CHECK-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP29]], [[TMP13]]
535-
; CHECK-NEXT: [[TMP34:%.*]] = lshr <4 x i32> [[TMP33]], splat (i32 15)
536-
; CHECK-NEXT: [[TMP35:%.*]] = and <4 x i32> [[TMP34]], splat (i32 65537)
537-
; CHECK-NEXT: [[TMP36:%.*]] = mul nuw <4 x i32> [[TMP35]], splat (i32 65535)
538-
; CHECK-NEXT: [[TMP37:%.*]] = add <4 x i32> [[TMP36]], [[TMP33]]
539-
; CHECK-NEXT: [[TMP38:%.*]] = xor <4 x i32> [[TMP37]], [[TMP36]]
540-
; CHECK-NEXT: [[TMP39:%.*]] = sub <4 x i32> [[TMP13]], [[TMP29]]
541-
; CHECK-NEXT: [[TMP40:%.*]] = lshr <4 x i32> [[TMP39]], splat (i32 15)
542-
; CHECK-NEXT: [[TMP41:%.*]] = and <4 x i32> [[TMP40]], splat (i32 65537)
543-
; CHECK-NEXT: [[TMP42:%.*]] = mul nuw <4 x i32> [[TMP41]], splat (i32 65535)
544-
; CHECK-NEXT: [[TMP43:%.*]] = add <4 x i32> [[TMP42]], [[TMP39]]
545-
; CHECK-NEXT: [[TMP44:%.*]] = xor <4 x i32> [[TMP43]], [[TMP42]]
546-
; CHECK-NEXT: [[TMP45:%.*]] = add <4 x i32> [[TMP31]], [[TMP15]]
547-
; CHECK-NEXT: [[TMP46:%.*]] = lshr <4 x i32> [[TMP45]], splat (i32 15)
548-
; CHECK-NEXT: [[TMP47:%.*]] = and <4 x i32> [[TMP46]], splat (i32 65537)
549-
; CHECK-NEXT: [[TMP48:%.*]] = mul nuw <4 x i32> [[TMP47]], splat (i32 65535)
550-
; CHECK-NEXT: [[TMP49:%.*]] = add <4 x i32> [[TMP48]], [[TMP45]]
551-
; CHECK-NEXT: [[TMP50:%.*]] = xor <4 x i32> [[TMP49]], [[TMP48]]
552-
; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP15]], [[TMP31]]
553-
; CHECK-NEXT: [[TMP52:%.*]] = lshr <4 x i32> [[TMP51]], splat (i32 15)
554-
; CHECK-NEXT: [[TMP53:%.*]] = and <4 x i32> [[TMP52]], splat (i32 65537)
555-
; CHECK-NEXT: [[TMP54:%.*]] = mul nuw <4 x i32> [[TMP53]], splat (i32 65535)
556-
; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP51]]
557-
; CHECK-NEXT: [[TMP56:%.*]] = xor <4 x i32> [[TMP55]], [[TMP54]]
558-
; CHECK-NEXT: [[TMP57:%.*]] = add <4 x i32> [[TMP30]], [[TMP14]]
559-
; CHECK-NEXT: [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], splat (i32 15)
560-
; CHECK-NEXT: [[TMP59:%.*]] = and <4 x i32> [[TMP58]], splat (i32 65537)
561-
; CHECK-NEXT: [[TMP60:%.*]] = mul nuw <4 x i32> [[TMP59]], splat (i32 65535)
562-
; CHECK-NEXT: [[TMP61:%.*]] = add <4 x i32> [[TMP60]], [[TMP57]]
563-
; CHECK-NEXT: [[TMP62:%.*]] = xor <4 x i32> [[TMP61]], [[TMP60]]
564-
; CHECK-NEXT: [[TMP63:%.*]] = sub <4 x i32> [[TMP14]], [[TMP30]]
565-
; CHECK-NEXT: [[TMP64:%.*]] = lshr <4 x i32> [[TMP63]], splat (i32 15)
566-
; CHECK-NEXT: [[TMP65:%.*]] = and <4 x i32> [[TMP64]], splat (i32 65537)
567-
; CHECK-NEXT: [[TMP66:%.*]] = mul nuw <4 x i32> [[TMP65]], splat (i32 65535)
568-
; CHECK-NEXT: [[TMP67:%.*]] = add <4 x i32> [[TMP66]], [[TMP63]]
569-
; CHECK-NEXT: [[TMP68:%.*]] = xor <4 x i32> [[TMP67]], [[TMP66]]
570-
; CHECK-NEXT: [[TMP69:%.*]] = add <4 x i32> [[TMP32]], [[TMP16]]
571-
; CHECK-NEXT: [[TMP70:%.*]] = lshr <4 x i32> [[TMP69]], splat (i32 15)
572-
; CHECK-NEXT: [[TMP71:%.*]] = and <4 x i32> [[TMP70]], splat (i32 65537)
573-
; CHECK-NEXT: [[TMP72:%.*]] = mul nuw <4 x i32> [[TMP71]], splat (i32 65535)
574-
; CHECK-NEXT: [[TMP73:%.*]] = add <4 x i32> [[TMP72]], [[TMP69]]
575-
; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i32> [[TMP73]], [[TMP72]]
576-
; CHECK-NEXT: [[TMP75:%.*]] = sub <4 x i32> [[TMP16]], [[TMP32]]
577-
; CHECK-NEXT: [[TMP76:%.*]] = lshr <4 x i32> [[TMP75]], splat (i32 15)
578-
; CHECK-NEXT: [[TMP77:%.*]] = and <4 x i32> [[TMP76]], splat (i32 65537)
579-
; CHECK-NEXT: [[TMP78:%.*]] = mul nuw <4 x i32> [[TMP77]], splat (i32 65535)
580-
; CHECK-NEXT: [[TMP79:%.*]] = add <4 x i32> [[TMP78]], [[TMP75]]
581-
; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i32> [[TMP79]], [[TMP78]]
582-
; CHECK-NEXT: [[TMP81:%.*]] = add <4 x i32> [[TMP74]], [[TMP80]]
583-
; CHECK-NEXT: [[TMP82:%.*]] = add <4 x i32> [[TMP81]], [[TMP68]]
584-
; CHECK-NEXT: [[TMP83:%.*]] = add <4 x i32> [[TMP82]], [[TMP62]]
585-
; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP83]], [[TMP44]]
586-
; CHECK-NEXT: [[TMP85:%.*]] = add <4 x i32> [[TMP84]], [[TMP38]]
587-
; CHECK-NEXT: [[TMP86:%.*]] = add <4 x i32> [[TMP85]], [[TMP56]]
588-
; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP50]]
589-
; CHECK-NEXT: [[TMP88:%.*]] = and <4 x i32> [[TMP87]], splat (i32 65535)
590-
; CHECK-NEXT: [[TMP89:%.*]] = lshr <4 x i32> [[TMP87]], splat (i32 16)
591-
; CHECK-NEXT: [[TMP90:%.*]] = add <4 x i32> [[TMP89]], zeroinitializer
592-
; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP90]], [[TMP88]]
490+
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> zeroinitializer, [[WIDE_LOAD]]
593491
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 4
594-
; CHECK-NEXT: [[TMP92:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP91]])
595-
; CHECK-NEXT: ret i32 [[TMP92]]
492+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
493+
; CHECK-NEXT: ret i32 [[TMP4]]
596494
;
597495
entry:
598-
%arrayidx2 = getelementptr inbounds nuw i8, ptr %tmp, i64 16
599-
%arrayidx11 = getelementptr inbounds nuw i8, ptr %tmp, i64 32
600-
%arrayidx14 = getelementptr inbounds nuw i8, ptr %tmp, i64 48
601-
%arrayidx30 = getelementptr inbounds nuw i8, ptr %tmp, i64 64
602-
%arrayidx33 = getelementptr inbounds nuw i8, ptr %tmp, i64 80
603-
%arrayidx46 = getelementptr inbounds nuw i8, ptr %tmp, i64 96
604-
%arrayidx49 = getelementptr inbounds nuw i8, ptr %tmp, i64 112
605496
br label %for.body
606497

607498
for.cond.cleanup: ; preds = %for.body
608-
%add89.lcssa = phi i32 [ %add89, %for.body ]
609-
ret i32 %add89.lcssa
499+
%add.lcssa = phi i32 [ %add, %for.body ]
500+
ret i32 %add.lcssa
610501

611502
for.body: ; preds = %entry, %for.body
612503
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
613-
%sum.0179 = phi i32 [ 0, %entry ], [ %add89, %for.body ]
504+
%sum.0179 = phi i32 [ 0, %entry ], [ %add, %for.body ]
614505
%arrayidx1 = getelementptr inbounds nuw [4 x i32], ptr %tmp, i64 0, i64 %indvars.iv
615506
%0 = load i32, ptr %arrayidx1, align 4
616-
%arrayidx4 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx2, i64 0, i64 %indvars.iv
617-
%1 = load i32, ptr %arrayidx4, align 4
618-
%add = add i32 %1, %0
619-
%sub = sub i32 %0, %1
620-
%arrayidx13 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx11, i64 0, i64 %indvars.iv
621-
%2 = load i32, ptr %arrayidx13, align 4
622-
%arrayidx16 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx14, i64 0, i64 %indvars.iv
623-
%3 = load i32, ptr %arrayidx16, align 4
624-
%add17 = add i32 %3, %2
625-
%sub24 = sub i32 %2, %3
626-
%add25 = add i32 %add17, %add
627-
%sub26 = sub i32 %add, %add17
628-
%add27 = add i32 %sub24, %sub
629-
%sub28 = sub i32 %sub, %sub24
630-
%arrayidx32 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx30, i64 0, i64 %indvars.iv
631-
%4 = load i32, ptr %arrayidx32, align 4
632-
%arrayidx35 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx33, i64 0, i64 %indvars.iv
633-
%5 = load i32, ptr %arrayidx35, align 4
634-
%add36 = add i32 %5, %4
635-
%sub44 = sub i32 %4, %5
636-
%arrayidx48 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx46, i64 0, i64 %indvars.iv
637-
%6 = load i32, ptr %arrayidx48, align 4
638-
%arrayidx51 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx49, i64 0, i64 %indvars.iv
639-
%7 = load i32, ptr %arrayidx51, align 4
640-
%add52 = add i32 %7, %6
641-
%sub60 = sub i32 %6, %7
642-
%add61 = add i32 %add52, %add36
643-
%sub62 = sub i32 %add36, %add52
644-
%add63 = add i32 %sub60, %sub44
645-
%sub64 = sub i32 %sub44, %sub60
646-
%add65 = add i32 %add61, %add25
647-
%shr.i173 = lshr i32 %add65, 15
648-
%and.i174 = and i32 %shr.i173, 65537
649-
%mul.i175 = mul nuw i32 %and.i174, 65535
650-
%add.i176 = add i32 %mul.i175, %add65
651-
%xor.i177 = xor i32 %add.i176, %mul.i175
652-
%sub66 = sub i32 %add25, %add61
653-
%shr.i168 = lshr i32 %sub66, 15
654-
%and.i169 = and i32 %shr.i168, 65537
655-
%mul.i170 = mul nuw i32 %and.i169, 65535
656-
%add.i171 = add i32 %mul.i170, %sub66
657-
%xor.i172 = xor i32 %add.i171, %mul.i170
658-
%add69 = add i32 %add63, %add27
659-
%shr.i163 = lshr i32 %add69, 15
660-
%and.i164 = and i32 %shr.i163, 65537
661-
%mul.i165 = mul nuw i32 %and.i164, 65535
662-
%add.i166 = add i32 %mul.i165, %add69
663-
%xor.i167 = xor i32 %add.i166, %mul.i165
664-
%sub71 = sub i32 %add27, %add63
665-
%shr.i158 = lshr i32 %sub71, 15
666-
%and.i159 = and i32 %shr.i158, 65537
667-
%mul.i160 = mul nuw i32 %and.i159, 65535
668-
%add.i161 = add i32 %mul.i160, %sub71
669-
%xor.i162 = xor i32 %add.i161, %mul.i160
670-
%add75 = add i32 %sub62, %sub26
671-
%shr.i153 = lshr i32 %add75, 15
672-
%and.i154 = and i32 %shr.i153, 65537
673-
%mul.i155 = mul nuw i32 %and.i154, 65535
674-
%add.i156 = add i32 %mul.i155, %add75
675-
%xor.i157 = xor i32 %add.i156, %mul.i155
676-
%sub77 = sub i32 %sub26, %sub62
677-
%shr.i148 = lshr i32 %sub77, 15
678-
%and.i149 = and i32 %shr.i148, 65537
679-
%mul.i150 = mul nuw i32 %and.i149, 65535
680-
%add.i151 = add i32 %mul.i150, %sub77
681-
%xor.i152 = xor i32 %add.i151, %mul.i150
682-
%add81 = add i32 %sub64, %sub28
683-
%shr.i143 = lshr i32 %add81, 15
684-
%and.i144 = and i32 %shr.i143, 65537
685-
%mul.i145 = mul nuw i32 %and.i144, 65535
686-
%add.i146 = add i32 %mul.i145, %add81
687-
%xor.i147 = xor i32 %add.i146, %mul.i145
688-
%sub83 = sub i32 %sub28, %sub64
689-
%shr.i = lshr i32 %sub83, 15
690-
%and.i = and i32 %shr.i, 65537
691-
%mul.i = mul nuw i32 %and.i, 65535
692-
%add.i = add i32 %mul.i, %sub83
693-
%xor.i = xor i32 %add.i, %mul.i
694-
%add73 = add i32 %xor.i147, %xor.i
695-
%add68 = add i32 %add73, %xor.i152
696-
%add74 = add i32 %add68, %xor.i157
697-
%add79 = add i32 %add74, %xor.i172
698-
%add80 = add i32 %add79, %xor.i177
699-
%add85 = add i32 %add80, %xor.i162
700-
%add86 = add i32 %add85, %xor.i167
701-
%conv87 = and i32 %add86, 65535
702-
%shr = lshr i32 %add86, 16
703-
%add88 = add i32 %shr, %sum.0179
704-
%add89 = add i32 %add88, %conv87
507+
%add = add i32 %sum.0179, %0
705508
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
706509
%exitcond.not = icmp eq i64 %indvars.iv.next, 4
707510
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
708511
}
709512

513+
; This has a trip-count of 4 from a profile.
514+
define i32 @tc4_from_profile(ptr noundef readonly captures(none) %tmp, i64 %N) vscale_range(1,16) {
515+
; CHECK-LABEL: define i32 @tc4_from_profile(
516+
; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
517+
; CHECK-NEXT: [[ENTRY:.*]]:
518+
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
519+
; CHECK: [[FOR_COND_CLEANUP:.*]]:
520+
; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ]
521+
; CHECK-NEXT: ret i32 [[TMP4]]
522+
; CHECK: [[FOR_BODY]]:
523+
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
524+
; CHECK-NEXT: [[SUM_0179:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD]], %[[FOR_BODY]] ]
525+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
526+
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
527+
; CHECK-NEXT: [[ADD]] = add i32 [[SUM_0179]], [[TMP0]]
528+
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
529+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
530+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF7:![0-9]+]]
531+
;
532+
entry:
533+
br label %for.body
534+
535+
for.cond.cleanup: ; preds = %for.body
536+
%add.lcssa = phi i32 [ %add, %for.body ]
537+
ret i32 %add.lcssa
538+
539+
for.body: ; preds = %entry, %for.body
540+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
541+
%sum.0179 = phi i32 [ 0, %entry ], [ %add, %for.body ]
542+
%arrayidx1 = getelementptr inbounds nuw [4 x i32], ptr %tmp, i64 0, i64 %indvars.iv
543+
%0 = load i32, ptr %arrayidx1, align 4
544+
%add = add i32 %sum.0179, %0
545+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
546+
%exitcond.not = icmp eq i64 %indvars.iv.next, %N
547+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !prof !2
548+
}
549+
710550

711551
!0 = distinct !{!0, !1}
712552
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
553+
!2 = !{!"branch_weights", i32 10, i32 30}
554+
713555
;.
714556
; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
715557
; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -718,6 +560,7 @@ for.body: ; preds = %entry, %for.body
718560
; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
719561
; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
720562
; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
563+
; CHECK-VS1: [[PROF7]] = !{!"branch_weights", i32 10, i32 30}
721564
;.
722565
; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
723566
; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -726,4 +569,5 @@ for.body: ; preds = %entry, %for.body
726569
; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
727570
; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
728571
; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
572+
; CHECK-VS2: [[PROF7]] = !{!"branch_weights", i32 10, i32 30}
729573
;.

0 commit comments

Comments
 (0)