Skip to content

Commit 4fa9ce5

Browse files
committed
[AArch64] Use tbl for truncating vector FPtoUI conversions.
On AArch64, splitting a truncating vector fptoui into a wide fptoui followed by a separate vector truncate allows the truncate to be lowered more efficiently using tbl.4, building on D133495. Alive2 proof: https://alive2.llvm.org/ce/z/T538CC — Depends on D133495. Reviewed By: t.p.northover. Differential Revision: https://reviews.llvm.org/D133496 (cherry-picked from 6b86b48)
1 parent 08dccd0 commit 4fa9ce5

File tree

3 files changed

+221
-125
lines changed

3 files changed

+221
-125
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7800,7 +7800,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
78007800
if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
78017801
return true;
78027802

7803-
if ((isa<UIToFPInst>(I) || isa<TruncInst>(I)) &&
7803+
if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
78047804
TLI->optimizeExtendOrTruncateConversion(I,
78057805
LI->getLoopFor(I->getParent())))
78067806
return true;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12961,6 +12961,23 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
1296112961
return true;
1296212962
}
1296312963

12964+
// Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
12965+
// followed by a truncate that is lowered using tbl.4.
12966+
auto *FPToUI = dyn_cast<FPToUIInst>(I);
12967+
if (FPToUI &&
12968+
(SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
12969+
SrcTy->getElementType()->isFloatTy() &&
12970+
DstTy->getElementType()->isIntegerTy(8)) {
12971+
IRBuilder<> Builder(I);
12972+
auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
12973+
VectorType::getInteger(SrcTy));
12974+
auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
12975+
I->replaceAllUsesWith(TruncI);
12976+
I->eraseFromParent();
12977+
createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
12978+
return true;
12979+
}
12980+
1296412981
// Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
1296512982
// instruction selecting the lowest 8 bits per lane of the input interpreted
1296612983
// as 2 or 4 <4 x i32> vectors.

0 commit comments

Comments
 (0)