@@ -69,7 +69,7 @@ static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69
69
cl::init (true ), cl::Hidden);
70
70
71
71
static cl::opt<unsigned > SmallMultiExitLoopUF (
72
- " small-multi-exit-loop-unroll-factor " , cl::init(0 ), cl::Hidden,
72
+ " aarch64- small-multi-exit-loop-unroll" , cl::init(0 ), cl::Hidden,
73
73
cl::desc(
74
74
" Force unrolling of small multi-exit loops with given unroll factor" ));
75
75
@@ -4386,35 +4386,45 @@ static bool shouldUnrollLoopWithInstruction(Instruction &I,
4386
4386
if (I.getType ()->isVectorTy ())
4387
4387
return false ;
4388
4388
4389
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4390
- if (const Function *F = cast<CallBase>(I).getCalledFunction ())
4391
- if (!TTI.isLoweredToCall (F))
4392
- return true ;
4389
+ if (isa<CallBase>(I)) {
4390
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
4391
+ if (const Function *F = cast<CallBase>(I).getCalledFunction ())
4392
+ if (!TTI.isLoweredToCall (F))
4393
+ return true ;
4393
4394
return false ;
4394
4395
}
4395
4396
4396
4397
return true ;
4397
4398
}
4398
4399
4399
- static InstructionCost getSizeOfLoop (Loop *L, AArch64TTIImpl &TTI) {
4400
+ static unsigned getLoopSize (Loop *L, AArch64TTIImpl &TTI,
4401
+ InstructionCost Budget) {
4400
4402
// Estimate the size of the loop.
4401
- InstructionCost Size = 0 ;
4403
+ InstructionCost LoopCost = 0 ;
4404
+
4405
+ if (findStringMetadataForLoop (L, " llvm.loop.isvectorized" ))
4406
+ return 0 ;
4407
+
4402
4408
for (auto *BB : L->getBlocks ()) {
4403
4409
for (auto &I : *BB) {
4404
4410
if (!shouldUnrollLoopWithInstruction (I, TTI))
4405
- return InstructionCost::getInvalid () ;
4411
+ return 0 ;
4406
4412
4407
4413
SmallVector<const Value *, 4 > Operands (I.operand_values ());
4408
4414
InstructionCost Cost =
4409
4415
TTI.getInstructionCost (&I, Operands, TTI::TCK_CodeSize);
4410
4416
// This can happen with intrinsics that don't currently have a cost model
4411
4417
// or for some operations that require SVE.
4412
4418
if (!Cost.isValid ())
4413
- return InstructionCost::getInvalid ();
4414
- Size += *Cost.getValue ();
4419
+ return 0 ;
4420
+
4421
+ LoopCost += Cost;
4422
+ if (LoopCost > Budget)
4423
+ return 0 ;
4415
4424
}
4416
4425
}
4417
- return Size;
4426
+
4427
+ return *LoopCost.getValue ();
4418
4428
}
4419
4429
4420
4430
static bool shouldUnrollMultiExitLoop (Loop *L, ScalarEvolution &SE,
@@ -4432,12 +4442,8 @@ static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4432
4442
if (MaxTC > 0 && MaxTC <= 32 )
4433
4443
return false ;
4434
4444
4435
- if (findStringMetadataForLoop (L, " llvm.loop.isvectorized" ))
4436
- return false ;
4437
-
4438
4445
// Estimate the size of the loop.
4439
- InstructionCost Size = getSizeOfLoop (L, TTI);
4440
- if (!Size.isValid ())
4446
+ if (!getLoopSize (L, TTI, 5 ))
4441
4447
return false ;
4442
4448
4443
4449
// Small search loops with multiple exits can be highly beneficial to unroll.
@@ -4452,7 +4458,7 @@ static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4452
4458
}))
4453
4459
return false ;
4454
4460
4455
- return *Size. getValue () < 6 ;
4461
+ return true ;
4456
4462
}
4457
4463
4458
4464
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
@@ -4469,28 +4475,15 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4469
4475
if (!L->isInnermost () || L->getNumBlocks () > 8 )
4470
4476
return ;
4471
4477
4478
+ // This is handled by common code.
4479
+ if (!L->getExitBlock ())
4480
+ return ;
4481
+
4472
4482
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount (L);
4473
4483
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4474
4484
(SE.getSmallConstantMaxTripCount (L) > 0 &&
4475
4485
SE.getSmallConstantMaxTripCount (L) <= 32 ))
4476
4486
return ;
4477
- if (findStringMetadataForLoop (L, " llvm.loop.isvectorized" ))
4478
- return ;
4479
-
4480
- int64_t Size = 0 ;
4481
- for (auto *BB : L->getBlocks ()) {
4482
- for (auto &I : *BB) {
4483
- if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4484
- return ;
4485
- SmallVector<const Value *, 4 > Operands (I.operand_values ());
4486
- Size +=
4487
- *TTI.getInstructionCost (&I, Operands, TTI::TCK_CodeSize).getValue ();
4488
- }
4489
- }
4490
-
4491
- // This is handled by common code.
4492
- if (!L->getExitBlock ())
4493
- return ;
4494
4487
4495
4488
if (SE.getSymbolicMaxBackedgeTakenCount (L) != SE.getBackedgeTakenCount (L))
4496
4489
return ;
@@ -4502,7 +4495,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4502
4495
// dependencies, to expose more parallel memory access streams.
4503
4496
BasicBlock *Header = L->getHeader ();
4504
4497
if (Header == L->getLoopLatch ()) {
4505
- if (Size > 8 )
4498
+ // Estimate the size of the loop.
4499
+ unsigned Size = getLoopSize (L, TTI, 8 );
4500
+ if (!Size)
4506
4501
return ;
4507
4502
4508
4503
SmallPtrSet<Value *, 8 > LoadedValues;
0 commit comments