Skip to content

Commit 95e4060

Browse files
committed
[LoopVectorizer] Prune VFs based on plan register pressure
Based on fhahn's work at llvm#126437. This PR moves the register usage checking to after the plans are created, so that any recipes that optimise register usage (such as partial reductions) can be properly costed and not have their VF pruned unnecessarily. It involves changing some tests, notably removing one from mve-known-tripcount.ll because it is not vectorisable due to high register pressure. tail-folding-reduces-vf.ll was modified to reduce its register pressure while still testing what was intended.
1 parent 1467b3b commit 95e4060

File tree

10 files changed

+286
-662
lines changed

10 files changed

+286
-662
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 46 additions & 234 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,11 +1022,6 @@ class LoopVectorizationCostModel {
10221022
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
10231023
};
10241024

1025-
/// \return Returns information about the register usages of the loop for the
1026-
/// given vectorization factors.
1027-
SmallVector<RegisterUsage, 8>
1028-
calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1029-
10301025
/// Collect values we want to ignore in the cost model.
10311026
void collectValuesToIgnore();
10321027

@@ -4189,27 +4184,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
41894184
ComputeScalableMaxVF);
41904185
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
41914186

4192-
// Collect all viable vectorization factors larger than the default MaxVF
4193-
// (i.e. MaxVectorElementCount).
4194-
SmallVector<ElementCount, 8> VFs;
4187+
// Set the max VF to the largest viable vectorization factor less than or
4188+
// equal to the maximum-bandwidth element count (MaxVectorElementCountMaxBW).
41954189
for (ElementCount VS = MaxVectorElementCount * 2;
41964190
ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4197-
VFs.push_back(VS);
4198-
4199-
// For each VF calculate its register usage.
4200-
auto RUs = calculateRegisterUsage(VFs);
4191+
MaxVF = VS;
42014192

4202-
// Select the largest VF which doesn't require more registers than existing
4203-
// ones.
4204-
for (int I = RUs.size() - 1; I >= 0; --I) {
4205-
const auto &MLU = RUs[I].MaxLocalUsers;
4206-
if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4207-
return LU.second <= TTI.getNumberOfRegisters(LU.first);
4208-
})) {
4209-
MaxVF = VFs[I];
4210-
break;
4211-
}
4212-
}
42134193
if (ElementCount MinVF =
42144194
TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
42154195
if (ElementCount::isKnownLT(MaxVF, MinVF)) {
@@ -5406,213 +5386,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
54065386
return 1;
54075387
}
54085388

5409-
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5410-
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5411-
// This function calculates the register usage by measuring the highest number
5412-
// of values that are alive at a single location. Obviously, this is a very
5413-
// rough estimation. We scan the loop in topological order and
5414-
// assign a number to each instruction. We use RPO to ensure that defs are
5415-
// met before their users. We assume that each instruction that has in-loop
5416-
// users starts an interval. We record every time that an in-loop value is
5417-
// used, so we have a list of the first and last occurrences of each
5418-
// instruction. Next, we transpose this data structure into a multi map that
5419-
// holds the list of intervals that *end* at a specific location. This multi
5420-
// map allows us to perform a linear search. We scan the instructions linearly
5421-
// and record each time that a new interval starts, by placing it in a set.
5422-
// If we find this value in the multi-map then we remove it from the set.
5423-
// The max register usage is the maximum size of the set.
5424-
// We also search for instructions that are defined outside the loop, but are
5425-
// used inside the loop. We need this number separately from the max-interval
5426-
// usage number because when we unroll, loop-invariant values do not take
5427-
// more registers.
5428-
LoopBlocksDFS DFS(TheLoop);
5429-
DFS.perform(LI);
5430-
5431-
RegisterUsage RU;
5432-
5433-
// Each 'key' in the map opens a new interval. The values
5434-
// of the map are the index of the 'last seen' usage of the
5435-
// instruction that is the key.
5436-
using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5437-
5438-
// Maps instruction to its index.
5439-
SmallVector<Instruction *, 64> IdxToInstr;
5440-
// Marks the end of each interval.
5441-
IntervalMap EndPoint;
5442-
// Saves the list of instruction indices that are used in the loop.
5443-
SmallPtrSet<Instruction *, 8> Ends;
5444-
// Saves the list of values that are used in the loop but are defined outside
5445-
// the loop (not including non-instruction values such as arguments and
5446-
// constants).
5447-
SmallSetVector<Instruction *, 8> LoopInvariants;
5448-
5449-
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5450-
for (Instruction &I : BB->instructionsWithoutDebug()) {
5451-
IdxToInstr.push_back(&I);
5452-
5453-
// Save the end location of each USE.
5454-
for (Value *U : I.operands()) {
5455-
auto *Instr = dyn_cast<Instruction>(U);
5456-
5457-
// Ignore non-instruction values such as arguments, constants, etc.
5458-
// FIXME: Might need some motivation why these values are ignored. If
5459-
// for example an argument is used inside the loop it will increase the
5460-
// register pressure (so shouldn't we add it to LoopInvariants).
5461-
if (!Instr)
5462-
continue;
5463-
5464-
// If this instruction is outside the loop then record it and continue.
5465-
if (!TheLoop->contains(Instr)) {
5466-
LoopInvariants.insert(Instr);
5467-
continue;
5468-
}
5469-
5470-
// Overwrite previous end points.
5471-
EndPoint[Instr] = IdxToInstr.size();
5472-
Ends.insert(Instr);
5473-
}
5474-
}
5475-
}
5476-
5477-
// Saves the list of intervals that end with the index in 'key'.
5478-
using InstrList = SmallVector<Instruction *, 2>;
5479-
SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5480-
5481-
// Transpose the EndPoints to a list of values that end at each index.
5482-
for (auto &Interval : EndPoint)
5483-
TransposeEnds[Interval.second].push_back(Interval.first);
5484-
5485-
SmallPtrSet<Instruction *, 8> OpenIntervals;
5486-
SmallVector<RegisterUsage, 8> RUs(VFs.size());
5487-
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5488-
5489-
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5490-
5491-
const auto &TTICapture = TTI;
5492-
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5493-
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5494-
(VF.isScalable() &&
5495-
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
5496-
return 0;
5497-
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5498-
};
5499-
5500-
collectInLoopReductions();
5501-
5502-
for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5503-
Instruction *I = IdxToInstr[Idx];
5504-
5505-
// Remove all of the instructions that end at this location.
5506-
InstrList &List = TransposeEnds[Idx];
5507-
for (Instruction *ToRemove : List)
5508-
OpenIntervals.erase(ToRemove);
5509-
5510-
// Ignore instructions that are never used within the loop and do not have
5511-
// side-effects.
5512-
if (!Ends.count(I) && !I->mayHaveSideEffects())
5513-
continue;
5514-
5515-
// Skip ignored values.
5516-
if (ValuesToIgnore.count(I))
5517-
continue;
5518-
5519-
// For each VF find the maximum usage of registers.
5520-
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5521-
// Count the number of registers used, per register class, given all open
5522-
// intervals.
5523-
// Note that elements in this SmallMapVector will be default constructed
5524-
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5525-
// there is no previous entry for ClassID.
5526-
SmallMapVector<unsigned, unsigned, 4> RegUsage;
5527-
5528-
if (VFs[J].isScalar()) {
5529-
for (auto *Inst : OpenIntervals) {
5530-
unsigned ClassID =
5531-
TTI.getRegisterClassForType(false, Inst->getType());
5532-
// FIXME: The target might use more than one register for the type
5533-
// even in the scalar case.
5534-
RegUsage[ClassID] += 1;
5535-
}
5536-
} else {
5537-
collectNonVectorizedAndSetWideningDecisions(VFs[J]);
5538-
for (auto *Inst : OpenIntervals) {
5539-
// Skip ignored values for VF > 1.
5540-
if (VecValuesToIgnore.count(Inst))
5541-
continue;
5542-
if (isScalarAfterVectorization(Inst, VFs[J])) {
5543-
unsigned ClassID =
5544-
TTI.getRegisterClassForType(false, Inst->getType());
5545-
// FIXME: The target might use more than one register for the type
5546-
// even in the scalar case.
5547-
RegUsage[ClassID] += 1;
5548-
} else {
5549-
unsigned ClassID =
5550-
TTI.getRegisterClassForType(true, Inst->getType());
5551-
RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5552-
}
5553-
}
5554-
}
5555-
5556-
for (const auto &Pair : RegUsage) {
5557-
auto &Entry = MaxUsages[J][Pair.first];
5558-
Entry = std::max(Entry, Pair.second);
5559-
}
5560-
}
5561-
5562-
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5563-
<< OpenIntervals.size() << '\n');
5564-
5565-
// Add the current instruction to the list of open intervals.
5566-
OpenIntervals.insert(I);
5567-
}
5568-
5569-
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5570-
// Note that elements in this SmallMapVector will be default constructed
5571-
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5572-
// there is no previous entry for ClassID.
5573-
SmallMapVector<unsigned, unsigned, 4> Invariant;
5574-
5575-
for (auto *Inst : LoopInvariants) {
5576-
// FIXME: The target might use more than one register for the type
5577-
// even in the scalar case.
5578-
bool IsScalar = all_of(Inst->users(), [&](User *U) {
5579-
auto *I = cast<Instruction>(U);
5580-
return TheLoop != LI->getLoopFor(I->getParent()) ||
5581-
isScalarAfterVectorization(I, VFs[Idx]);
5582-
});
5583-
5584-
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5585-
unsigned ClassID =
5586-
TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5587-
Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5588-
}
5589-
5590-
LLVM_DEBUG({
5591-
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5592-
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5593-
<< " item\n";
5594-
for (const auto &pair : MaxUsages[Idx]) {
5595-
dbgs() << "LV(REG): RegisterClass: "
5596-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5597-
<< " registers\n";
5598-
}
5599-
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5600-
<< " item\n";
5601-
for (const auto &pair : Invariant) {
5602-
dbgs() << "LV(REG): RegisterClass: "
5603-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5604-
<< " registers\n";
5605-
}
5606-
});
5607-
5608-
RU.LoopInvariantRegs = Invariant;
5609-
RU.MaxLocalUsers = MaxUsages[Idx];
5610-
RUs[Idx] = RU;
5611-
}
5612-
5613-
return RUs;
5614-
}
5615-
56165389
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
56175390
ElementCount VF) {
56185391
// TODO: Cost model for emulated masked load/store is completely
@@ -7780,7 +7553,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
77807553
}
77817554

77827555
for (auto &P : VPlans) {
7783-
for (ElementCount VF : P->vectorFactors()) {
7556+
SmallVector<ElementCount, 1> VFs(P->vectorFactors());
7557+
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
7558+
for (unsigned I = 0; I < VFs.size(); I++) {
7559+
auto VF = VFs[I];
77847560
if (VF.isScalar())
77857561
continue;
77867562
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -7801,12 +7577,23 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
78017577

78027578
InstructionCost Cost = cost(*P, VF);
78037579
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7804-
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7805-
BestFactor = CurrentFactor;
7806-
78077580
// If profitable add it to ProfitableVF list.
78087581
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
78097582
ProfitableVFs.push_back(CurrentFactor);
7583+
7584+
// Make sure that the VF doesn't use more than the number of available
7585+
// registers.
7586+
const auto &MLU = RUs[I].MaxLocalUsers;
7587+
if (any_of(MLU, [&](decltype(MLU.front()) &LU) {
7588+
return LU.second > TTI.getNumberOfRegisters(LU.first);
7589+
})) {
7590+
LLVM_DEBUG(dbgs() << "LV(REG): Ignoring VF " << VF
7591+
<< " as it uses too many registers\n");
7592+
continue;
7593+
}
7594+
7595+
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7596+
BestFactor = CurrentFactor;
78107597
}
78117598
}
78127599

@@ -7818,6 +7605,30 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
78187605
VectorizationFactor LegacyVF = selectVectorizationFactor();
78197606
VPlan &BestPlan = getPlanFor(BestFactor.Width);
78207607

7608+
// VPlan calculates register pressure from the plan, so it can come to
7609+
// different conclusions than the legacy cost model.
7610+
bool RegUsageDeterminedVF = false;
7611+
if (BestFactor.Width != LegacyVF.Width) {
7612+
SmallVector<ElementCount, 1> LegacyVFs = {LegacyVF.Width};
7613+
SmallVector<ElementCount, 1> VFs = {BestFactor.Width};
7614+
7615+
auto LegacyRUs =
7616+
::calculateRegisterUsage(getPlanFor(LegacyVF.Width), LegacyVFs, TTI, CM.ValuesToIgnore);
7617+
auto RUs = ::calculateRegisterUsage(BestPlan, VFs, TTI, CM.ValuesToIgnore);
7618+
7619+
auto GetMaxUsage = [](
7620+
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers) {
7621+
unsigned Max = 0;
7622+
for (auto Pair : MaxLocalUsers)
7623+
if (Pair.second > Max)
7624+
Max = Pair.second;
7625+
return Max;
7626+
};
7627+
unsigned MaxLegacyRegUsage = GetMaxUsage(LegacyRUs[0].MaxLocalUsers);
7628+
unsigned MaxRegUsage = GetMaxUsage(RUs[0].MaxLocalUsers);
7629+
RegUsageDeterminedVF = MaxRegUsage <= MaxLegacyRegUsage;
7630+
}
7631+
78217632
// Pre-compute the cost and use it to check if BestPlan contains any
78227633
// simplifications not accounted for in the legacy cost model. If that's the
78237634
// case, don't trigger the assertion, as the extra simplifications may cause a
@@ -7829,6 +7640,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
78297640
// with early exits and plans with additional VPlan simplifications. The
78307641
// legacy cost model doesn't properly model costs for such loops.
78317642
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7643+
RegUsageDeterminedVF ||
78327644
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
78337645
CostCtx, OrigLoop) ||
78347646
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -798,9 +798,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
798798
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
799799
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
800800
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
801-
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
801+
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
802802
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
803-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
803+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP54]], align 1
804804
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
805805
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
806806
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
@@ -840,9 +840,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
840840
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
841841
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
842842
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
843-
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
843+
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
844844
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]]
845-
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]]
845+
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP38]]
846846
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]])
847847
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
848848
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
@@ -869,10 +869,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
869869
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
870870
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
871871
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
872-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
873-
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
874-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
872+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
875873
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
874+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX33:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
875+
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX33]])
876876
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
877877
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
878878
; CHECK-INTERLEAVED: scalar.ph:
@@ -946,6 +946,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
946946
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
947947
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
948948
; CHECK-MAXBW: scalar.ph:
949+
;
949950
entry:
950951
br label %for.body
951952

0 commit comments

Comments
 (0)