Skip to content

Commit 6b86b48

Browse files
committed
[AArch64] Use tbl for truncating vector FPtoUI conversions.
On AArch64, doing the vector truncate separately after the fptoui conversion allows it to be lowered more efficiently using tbl.4, building on D133495.
Alive2: https://alive2.llvm.org/ce/z/T538CC
Depends on D133495.
Reviewed By: t.p.northover
Differential Revision: https://reviews.llvm.org/D133496
1 parent e596422 commit 6b86b48

File tree

3 files changed

+221
-125
lines changed

3 files changed

+221
-125
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8047,7 +8047,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
80478047
if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
80488048
return true;
80498049

8050-
if ((isa<UIToFPInst>(I) || isa<TruncInst>(I)) &&
8050+
if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
80518051
TLI->optimizeExtendOrTruncateConversion(I,
80528052
LI->getLoopFor(I->getParent())))
80538053
return true;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13289,6 +13289,23 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
1328913289
return true;
1329013290
}
1329113291

13292+
// Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
13293+
// followed by a truncate that is lowered using tbl.4.
13294+
auto *FPToUI = dyn_cast<FPToUIInst>(I);
13295+
if (FPToUI &&
13296+
(SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13297+
SrcTy->getElementType()->isFloatTy() &&
13298+
DstTy->getElementType()->isIntegerTy(8)) {
13299+
IRBuilder<> Builder(I);
13300+
auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
13301+
VectorType::getInteger(SrcTy));
13302+
auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
13303+
I->replaceAllUsesWith(TruncI);
13304+
I->eraseFromParent();
13305+
createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
13306+
return true;
13307+
}
13308+
1329213309
// Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
1329313310
// instruction selecting the lowest 8 bits per lane of the input interpreted
1329413311
// as 2 or 4 <4 x i32> vectors.

0 commit comments

Comments
 (0)