Skip to content

Commit 9c95cba

Browse files
committed
[LoopVectorizer] Prune VFs based on plan register pressure
Based on fhahn's work at llvm#126437. This PR moves the register usage check to after the plans are created, so that any recipes that optimise register usage (such as partial reductions) can be properly costed and do not have their VF pruned unnecessarily. It involves changing some tests, notably removing one from mve-known-tripcount.ll because high register pressure prevents it from being vectorised. tail-folding-reduces-vf.ll was modified to reduce its register pressure while still testing what was intended.
1 parent 04fde85 commit 9c95cba

File tree

10 files changed

+262
-665
lines changed

10 files changed

+262
-665
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 46 additions & 234 deletions
Original file line numberDiff line numberDiff line change
@@ -998,11 +998,6 @@ class LoopVectorizationCostModel {
998998
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
999999
};
10001000

1001-
/// \return Returns information about the register usages of the loop for the
1002-
/// given vectorization factors.
1003-
SmallVector<RegisterUsage, 8>
1004-
calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1005-
10061001
/// Collect values we want to ignore in the cost model.
10071002
void collectValuesToIgnore();
10081003

@@ -4015,27 +4010,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
40154010
ComputeScalableMaxVF);
40164011
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
40174012

4018-
// Collect all viable vectorization factors larger than the default MaxVF
4019-
// (i.e. MaxVectorElementCount).
4020-
SmallVector<ElementCount, 8> VFs;
4013+
// Set the max VF to the largest viable vectorization factor less than or
4014+
// equal to the maximum-bandwidth element count (MaxVectorElementCountMaxBW).
40214015
for (ElementCount VS = MaxVectorElementCount * 2;
40224016
ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4023-
VFs.push_back(VS);
4024-
4025-
// For each VF calculate its register usage.
4026-
auto RUs = calculateRegisterUsage(VFs);
4017+
MaxVF = VS;
40274018

4028-
// Select the largest VF which doesn't require more registers than existing
4029-
// ones.
4030-
for (int I = RUs.size() - 1; I >= 0; --I) {
4031-
const auto &MLU = RUs[I].MaxLocalUsers;
4032-
if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4033-
return LU.second <= TTI.getNumberOfRegisters(LU.first);
4034-
})) {
4035-
MaxVF = VFs[I];
4036-
break;
4037-
}
4038-
}
40394019
if (ElementCount MinVF =
40404020
TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
40414021
if (ElementCount::isKnownLT(MaxVF, MinVF)) {
@@ -5234,213 +5214,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
52345214
return 1;
52355215
}
52365216

5237-
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5238-
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5239-
// This function calculates the register usage by measuring the highest number
5240-
// of values that are alive at a single location. Obviously, this is a very
5241-
// rough estimation. We scan the loop in topological order and
5242-
// assign a number to each instruction. We use RPO to ensure that defs are
5243-
// met before their users. We assume that each instruction that has in-loop
5244-
// users starts an interval. We record every time that an in-loop value is
5245-
// used, so we have a list of the first and last occurrences of each
5246-
// instruction. Next, we transpose this data structure into a multi map that
5247-
// holds the list of intervals that *end* at a specific location. This multi
5248-
// map allows us to perform a linear search. We scan the instructions linearly
5249-
// and record each time that a new interval starts, by placing it in a set.
5250-
// If we find this value in the multi-map then we remove it from the set.
5251-
// The max register usage is the maximum size of the set.
5252-
// We also search for instructions that are defined outside the loop, but are
5253-
// used inside the loop. We need this number separately from the max-interval
5254-
// usage number because when we unroll, loop-invariant values do not take
5255-
// more registers.
5256-
LoopBlocksDFS DFS(TheLoop);
5257-
DFS.perform(LI);
5258-
5259-
RegisterUsage RU;
5260-
5261-
// Each 'key' in the map opens a new interval. The values
5262-
// of the map are the index of the 'last seen' usage of the
5263-
// instruction that is the key.
5264-
using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5265-
5266-
// Maps instruction to its index.
5267-
SmallVector<Instruction *, 64> IdxToInstr;
5268-
// Marks the end of each interval.
5269-
IntervalMap EndPoint;
5270-
// Saves the list of instruction indices that are used in the loop.
5271-
SmallPtrSet<Instruction *, 8> Ends;
5272-
// Saves the list of values that are used in the loop but are defined outside
5273-
// the loop (not including non-instruction values such as arguments and
5274-
// constants).
5275-
SmallSetVector<Instruction *, 8> LoopInvariants;
5276-
5277-
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5278-
for (Instruction &I : BB->instructionsWithoutDebug()) {
5279-
IdxToInstr.push_back(&I);
5280-
5281-
// Save the end location of each USE.
5282-
for (Value *U : I.operands()) {
5283-
auto *Instr = dyn_cast<Instruction>(U);
5284-
5285-
// Ignore non-instruction values such as arguments, constants, etc.
5286-
// FIXME: Might need some motivation why these values are ignored. If
5287-
// for example an argument is used inside the loop it will increase the
5288-
// register pressure (so shouldn't we add it to LoopInvariants).
5289-
if (!Instr)
5290-
continue;
5291-
5292-
// If this instruction is outside the loop then record it and continue.
5293-
if (!TheLoop->contains(Instr)) {
5294-
LoopInvariants.insert(Instr);
5295-
continue;
5296-
}
5297-
5298-
// Overwrite previous end points.
5299-
EndPoint[Instr] = IdxToInstr.size();
5300-
Ends.insert(Instr);
5301-
}
5302-
}
5303-
}
5304-
5305-
// Saves the list of intervals that end with the index in 'key'.
5306-
using InstrList = SmallVector<Instruction *, 2>;
5307-
SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5308-
5309-
// Transpose the EndPoints to a list of values that end at each index.
5310-
for (auto &Interval : EndPoint)
5311-
TransposeEnds[Interval.second].push_back(Interval.first);
5312-
5313-
SmallPtrSet<Instruction *, 8> OpenIntervals;
5314-
SmallVector<RegisterUsage, 8> RUs(VFs.size());
5315-
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5316-
5317-
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5318-
5319-
const auto &TTICapture = TTI;
5320-
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5321-
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5322-
(VF.isScalable() &&
5323-
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
5324-
return 0;
5325-
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5326-
};
5327-
5328-
collectInLoopReductions();
5329-
5330-
for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5331-
Instruction *I = IdxToInstr[Idx];
5332-
5333-
// Remove all of the instructions that end at this location.
5334-
InstrList &List = TransposeEnds[Idx];
5335-
for (Instruction *ToRemove : List)
5336-
OpenIntervals.erase(ToRemove);
5337-
5338-
// Ignore instructions that are never used within the loop and do not have
5339-
// side-effects.
5340-
if (!Ends.count(I) && !I->mayHaveSideEffects())
5341-
continue;
5342-
5343-
// Skip ignored values.
5344-
if (ValuesToIgnore.count(I))
5345-
continue;
5346-
5347-
// For each VF find the maximum usage of registers.
5348-
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5349-
// Count the number of registers used, per register class, given all open
5350-
// intervals.
5351-
// Note that elements in this SmallMapVector will be default constructed
5352-
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5353-
// there is no previous entry for ClassID.
5354-
SmallMapVector<unsigned, unsigned, 4> RegUsage;
5355-
5356-
if (VFs[J].isScalar()) {
5357-
for (auto *Inst : OpenIntervals) {
5358-
unsigned ClassID =
5359-
TTI.getRegisterClassForType(false, Inst->getType());
5360-
// FIXME: The target might use more than one register for the type
5361-
// even in the scalar case.
5362-
RegUsage[ClassID] += 1;
5363-
}
5364-
} else {
5365-
collectNonVectorizedAndSetWideningDecisions(VFs[J]);
5366-
for (auto *Inst : OpenIntervals) {
5367-
// Skip ignored values for VF > 1.
5368-
if (VecValuesToIgnore.count(Inst))
5369-
continue;
5370-
if (isScalarAfterVectorization(Inst, VFs[J])) {
5371-
unsigned ClassID =
5372-
TTI.getRegisterClassForType(false, Inst->getType());
5373-
// FIXME: The target might use more than one register for the type
5374-
// even in the scalar case.
5375-
RegUsage[ClassID] += 1;
5376-
} else {
5377-
unsigned ClassID =
5378-
TTI.getRegisterClassForType(true, Inst->getType());
5379-
RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5380-
}
5381-
}
5382-
}
5383-
5384-
for (const auto &Pair : RegUsage) {
5385-
auto &Entry = MaxUsages[J][Pair.first];
5386-
Entry = std::max(Entry, Pair.second);
5387-
}
5388-
}
5389-
5390-
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5391-
<< OpenIntervals.size() << '\n');
5392-
5393-
// Add the current instruction to the list of open intervals.
5394-
OpenIntervals.insert(I);
5395-
}
5396-
5397-
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5398-
// Note that elements in this SmallMapVector will be default constructed
5399-
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5400-
// there is no previous entry for ClassID.
5401-
SmallMapVector<unsigned, unsigned, 4> Invariant;
5402-
5403-
for (auto *Inst : LoopInvariants) {
5404-
// FIXME: The target might use more than one register for the type
5405-
// even in the scalar case.
5406-
bool IsScalar = all_of(Inst->users(), [&](User *U) {
5407-
auto *I = cast<Instruction>(U);
5408-
return TheLoop != LI->getLoopFor(I->getParent()) ||
5409-
isScalarAfterVectorization(I, VFs[Idx]);
5410-
});
5411-
5412-
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5413-
unsigned ClassID =
5414-
TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5415-
Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5416-
}
5417-
5418-
LLVM_DEBUG({
5419-
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5420-
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5421-
<< " item\n";
5422-
for (const auto &pair : MaxUsages[Idx]) {
5423-
dbgs() << "LV(REG): RegisterClass: "
5424-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5425-
<< " registers\n";
5426-
}
5427-
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5428-
<< " item\n";
5429-
for (const auto &pair : Invariant) {
5430-
dbgs() << "LV(REG): RegisterClass: "
5431-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5432-
<< " registers\n";
5433-
}
5434-
});
5435-
5436-
RU.LoopInvariantRegs = Invariant;
5437-
RU.MaxLocalUsers = MaxUsages[Idx];
5438-
RUs[Idx] = RU;
5439-
}
5440-
5441-
return RUs;
5442-
}
5443-
54445217
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
54455218
ElementCount VF) {
54465219
// TODO: Cost model for emulated masked load/store is completely
@@ -7621,7 +7394,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
76217394
}
76227395

76237396
for (auto &P : VPlans) {
7624-
for (ElementCount VF : P->vectorFactors()) {
7397+
SmallVector<ElementCount, 1> VFs(P->vectorFactors());
7398+
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
7399+
for (unsigned I = 0; I < VFs.size(); I++) {
7400+
auto VF = VFs[I];
76257401
if (VF.isScalar())
76267402
continue;
76277403
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -7642,12 +7418,23 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
76427418

76437419
InstructionCost Cost = cost(*P, VF);
76447420
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7645-
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7646-
BestFactor = CurrentFactor;
7647-
76487421
// If profitable add it to ProfitableVF list.
76497422
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
76507423
ProfitableVFs.push_back(CurrentFactor);
7424+
7425+
// Make sure that the VF doesn't use more than the number of available
7426+
// registers in any register class.
7427+
const auto &MLU = RUs[I].MaxLocalUsers;
7428+
if (any_of(MLU, [&](decltype(MLU.front()) &LU) {
7429+
return LU.second > TTI.getNumberOfRegisters(LU.first);
7430+
})) {
7431+
LLVM_DEBUG(dbgs() << "LV(REG): Ignoring VF " << VF
7432+
<< " as it uses too many registers\n");
7433+
continue;
7434+
}
7435+
7436+
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7437+
BestFactor = CurrentFactor;
76517438
}
76527439
}
76537440

@@ -7659,6 +7446,30 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
76597446
VectorizationFactor LegacyVF = selectVectorizationFactor();
76607447
VPlan &BestPlan = getPlanFor(BestFactor.Width);
76617448

7449+
// VPlan calculates register pressure from the plan, so it can come to
7450+
// different conclusions than the legacy cost model.
7451+
bool RegUsageDeterminedVF = false;
7452+
if (BestFactor.Width != LegacyVF.Width) {
7453+
SmallVector<ElementCount, 1> LegacyVFs = {LegacyVF.Width};
7454+
SmallVector<ElementCount, 1> VFs = {BestFactor.Width};
7455+
7456+
auto LegacyRUs =
7457+
::calculateRegisterUsage(getPlanFor(LegacyVF.Width), LegacyVFs, TTI, CM.ValuesToIgnore);
7458+
auto RUs = ::calculateRegisterUsage(BestPlan, VFs, TTI, CM.ValuesToIgnore);
7459+
7460+
auto GetMaxUsage = [](
7461+
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers) {
7462+
unsigned Max = 0;
7463+
for (auto Pair : MaxLocalUsers)
7464+
if (Pair.second > Max)
7465+
Max = Pair.second;
7466+
return Max;
7467+
};
7468+
unsigned MaxLegacyRegUsage = GetMaxUsage(LegacyRUs[0].MaxLocalUsers);
7469+
unsigned MaxRegUsage = GetMaxUsage(RUs[0].MaxLocalUsers);
7470+
RegUsageDeterminedVF = MaxRegUsage <= MaxLegacyRegUsage;
7471+
}
7472+
76627473
// Pre-compute the cost and use it to check if BestPlan contains any
76637474
// simplifications not accounted for in the legacy cost model. If that's the
76647475
// case, don't trigger the assertion, as the extra simplifications may cause a
@@ -7670,6 +7481,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
76707481
// with early exits and plans with additional VPlan simplifications. The
76717482
// legacy cost model doesn't properly model costs for such loops.
76727483
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7484+
RegUsageDeterminedVF ||
76737485
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
76747486
CostCtx, OrigLoop) ||
76757487
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -798,9 +798,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
798798
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
799799
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
800800
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
801-
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
801+
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
802802
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
803-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
803+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP54]], align 1
804804
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
805805
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
806806
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
@@ -840,9 +840,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
840840
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
841841
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
842842
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
843-
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
843+
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
844844
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]]
845-
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]]
845+
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP38]]
846846
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]])
847847
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
848848
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
@@ -869,10 +869,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
869869
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
870870
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
871871
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
872-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
873-
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
874-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
872+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
875873
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
874+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX33:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
875+
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX33]])
876876
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
877877
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
878878
; CHECK-INTERLEAVED: scalar.ph:
@@ -946,6 +946,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
946946
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
947947
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
948948
; CHECK-MAXBW: scalar.ph:
949+
;
949950
entry:
950951
br label %for.body
951952

0 commit comments

Comments
 (0)