Skip to content

Commit 81a11da

Browse files
committed
[CGP,AArch64] Replace zexts with shuffle that can be lowered using tbl.
This patch extends CodeGenPrepare to lower zext v16i8 -> v16i32 in loops using a wide shuffle that creates a v64i8 vector, where each group of four bytes consists of three zero elements and one element from the input. This is profitable on AArch64, where such shuffles can be lowered to tbl instructions, but only in loops, because it requires materializing 4 masks, which can be done in the loop preheader. This is the only reason the transform is part of CGP. If there's a better alternative I missed, please let me know. The same goes for the optimizeExtendOrTruncateConversion hook which guards this. I am not sure if this transform will be beneficial on other targets, but it seems like there is no other convenient way.

This improves the generated code for loops like the one below in combination with D96522.

int foo(uint8_t *p, int N) {
  unsigned long long sum = 0;
  for (int i = 0; i < N ; i++, p++) {
    unsigned int v = *p;
    sum += (v < 127) ? v : 256 - v;
  }
  return sum;
}

https://clang.godbolt.org/z/Wco866MjY

Reviewed By: t.p.northover

Differential Revision: https://reviews.llvm.org/D120571
1 parent 6c9d2ee commit 81a11da

File tree

7 files changed

+416
-77
lines changed

7 files changed

+416
-77
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class Constant;
6767
class FastISel;
6868
class FunctionLoweringInfo;
6969
class GlobalValue;
70+
class Loop;
7071
class GISelKnownBits;
7172
class IntrinsicInst;
7273
class IRBuilderBase;
@@ -2798,6 +2799,13 @@ class TargetLoweringBase {
27982799
return false;
27992800
}
28002801

2802+
/// Try to optimize extending or truncating conversion instructions (like
2803+
/// zext, trunc, fptoui, uitofp) for the target.
///
/// \p I is the conversion instruction to inspect. \p L is the loop the caller
/// associates with \p I; CodeGenPrepare passes the result of
/// LoopInfo::getLoopFor() on I's parent block.
/// NOTE(review): \p L is presumably null when \p I is not inside a loop (the
/// AArch64 override checks for that) -- confirm against LoopInfo.
///
/// Returns true if the target rewrote the conversion. An override is allowed
/// to replace and erase \p I (the AArch64 override does), so callers should
/// not touch \p I after a true result. The default implementation performs no
/// transformation and returns false.
2804+
virtual bool optimizeExtendOrTruncateConversion(Instruction *I,
2805+
Loop *L) const {
2806+
return false;
2807+
}
2808+
28012809
/// Return true if the target supplies and combines to a paired load
28022810
/// two loaded values of type LoadedType next to each other in memory.
28032811
/// RequiredAlignment gives the minimal alignment constraints that must be met

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8055,6 +8055,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
80558055
TargetLowering::TypeExpandInteger) {
80568056
return SinkCast(CI);
80578057
} else {
8058+
if (TLI->optimizeExtendOrTruncateConversion(
8059+
I, LI->getLoopFor(I->getParent())))
8060+
return true;
8061+
80588062
bool MadeChange = optimizeExt(I);
80598063
return MadeChange | optimizeExtUses(I);
80608064
}

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "llvm/ADT/StringRef.h"
3030
#include "llvm/ADT/Triple.h"
3131
#include "llvm/ADT/Twine.h"
32+
#include "llvm/Analysis/LoopInfo.h"
3233
#include "llvm/Analysis/MemoryLocation.h"
3334
#include "llvm/Analysis/ObjCARCUtil.h"
3435
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -13183,6 +13184,60 @@ bool AArch64TargetLowering::shouldSinkOperands(
1318313184
return false;
1318413185
}
1318513186

13187+
/// Replace \p ZExt, a zero-extension from a fixed vector of i8 to a fixed
/// vector of i32 with the same element count, by a byte shuffle followed by a
/// bitcast, then erase \p ZExt.
///
/// The shuffle produces 4 * NumElts bytes. Within each group of four bytes,
/// one byte is the corresponding source element and the other three are zero.
/// The source byte is placed at the least-significant position of the i32
/// lane: offset 0 when \p IsLittleEndian is true, offset 3 otherwise. Such a
/// shuffle can later be lowered to a set of tbl instructions.
///
/// The caller must have verified that both the operand type and the result
/// type of \p ZExt are fixed-width vectors with i8 and i32 elements
/// respectively; the factor 4 and the i8 zero below are hard-coded for that
/// case.
static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
  Value *Op = ZExt->getOperand(0);
  // Both types are known to be fixed vectors at this point. Use cast<>, which
  // asserts on a mismatch, instead of dyn_cast<>, whose possibly-null result
  // was previously dereferenced without a check.
  auto *SrcTy = cast<FixedVectorType>(Op->getType());
  auto *DstTy = cast<FixedVectorType>(ZExt->getType());
  unsigned NumElts = SrcTy->getNumElements();
  IRBuilder<> Builder(ZExt);

  // Every mask entry starts out as index NumElts, i.e. element 0 of the second
  // shuffle operand, which is the constant zero byte inserted below.
  SmallVector<int> Mask(4 * NumElts, NumElts);
  // For each i32 lane, overwrite exactly one of its four byte slots with the
  // index of the matching source element: <Op[i],0,0,0> on little-endian and
  // <0,0,0,Op[i]> on big-endian.
  for (unsigned I = 0; I != NumElts; ++I) {
    if (IsLittleEndian)
      Mask[I * 4] = I;
    else
      Mask[I * 4 + 3] = I;
  }

  // Second shuffle operand: only element 0 is defined (as zero); it is the
  // only element of this vector that the mask references.
  auto *FirstEltZero = Builder.CreateInsertElement(
      PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
  Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
  // Reinterpret the <4 * NumElts x i8> shuffle result as the original
  // destination vector type.
  Result = Builder.CreateBitCast(Result, DstTy);
  ZExt->replaceAllUsesWith(Result);
  ZExt->eraseFromParent();
}
13211+
13212+
bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
                                                               Loop *L) const {
  // The tbl-based lowering needs constant index vectors, which cost code size
  // and extra loads. It is therefore only attempted for conversions located in
  // the header block of a loop, and never when the function is being optimized
  // for size.
  auto *ParentBB = I->getParent();
  Function *Fn = ParentBB->getParent();
  if (!L)
    return false;
  if (L->getHeader() != ParentBB)
    return false;
  if (Fn->hasMinSize() || Fn->hasOptSize())
    return false;

  // Only conversions between fixed-width vector types are handled.
  auto *InVecTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
  auto *OutVecTy = dyn_cast<FixedVectorType>(I->getType());
  if (!InVecTy || !OutVecTy)
    return false;

  // Rewrite 'zext <(8|16) x i8> %x to <(8|16) x i32>' as a shuffle that can be
  // lowered to either 2 or 4 tbl instructions, which place the original i8
  // elements into i32 lanes.
  auto *ZExt = dyn_cast<ZExtInst>(I);
  if (!ZExt)
    return false;

  unsigned NumLanes = InVecTy->getNumElements();
  if (NumLanes != 8 && NumLanes != 16)
    return false;

  if (!InVecTy->getElementType()->isIntegerTy(8))
    return false;
  if (!OutVecTy->getElementType()->isIntegerTy(32))
    return false;

  createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
  return true;
}
13240+
1318613241
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
1318713242
Align &RequiredAligment) const {
1318813243
if (!LoadedType.isSimple() ||

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,9 @@ class AArch64TargetLowering : public TargetLowering {
606606
bool shouldSinkOperands(Instruction *I,
607607
SmallVectorImpl<Use *> &Ops) const override;
608608

609+
bool optimizeExtendOrTruncateConversion(Instruction *I,
610+
Loop *L) const override;
611+
609612
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
610613

611614
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

llvm/test/CodeGen/AArch64/vselect-ext.ll

Lines changed: 62 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -573,35 +573,53 @@ entry:
573573
define void @extension_in_loop_v16i8_to_v16i32(i8* %src, i32* %dst) {
574574
; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32:
575575
; CHECK: ; %bb.0: ; %entry
576-
; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff
576+
; CHECK-NEXT: Lloh2:
577+
; CHECK-NEXT: adrp x9, lCPI24_0@PAGE
578+
; CHECK-NEXT: Lloh3:
579+
; CHECK-NEXT: adrp x10, lCPI24_1@PAGE
580+
; CHECK-NEXT: Lloh4:
581+
; CHECK-NEXT: adrp x11, lCPI24_2@PAGE
582+
; CHECK-NEXT: Lloh5:
583+
; CHECK-NEXT: adrp x12, lCPI24_3@PAGE
584+
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
577585
; CHECK-NEXT: mov x8, xzr
586+
; CHECK-NEXT: Lloh6:
587+
; CHECK-NEXT: ldr q0, [x9, lCPI24_0@PAGEOFF]
588+
; CHECK-NEXT: Lloh7:
589+
; CHECK-NEXT: ldr q1, [x10, lCPI24_1@PAGEOFF]
590+
; CHECK-NEXT: Lloh8:
591+
; CHECK-NEXT: ldr q3, [x11, lCPI24_2@PAGEOFF]
592+
; CHECK-NEXT: Lloh9:
593+
; CHECK-NEXT: ldr q4, [x12, lCPI24_3@PAGEOFF]
578594
; CHECK-NEXT: LBB24_1: ; %loop
579595
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
580-
; CHECK-NEXT: ldr q1, [x0, x8]
596+
; CHECK-NEXT: ldr q5, [x0, x8]
581597
; CHECK-NEXT: add x8, x8, #16
582598
; CHECK-NEXT: cmp x8, #128
583-
; CHECK-NEXT: cmgt.16b v2, v1, v0
584-
; CHECK-NEXT: ushll2.8h v3, v1, #0
585-
; CHECK-NEXT: sshll2.8h v4, v2, #0
586-
; CHECK-NEXT: ushll2.4s v5, v3, #0
587-
; CHECK-NEXT: ushll.4s v3, v3, #0
588-
; CHECK-NEXT: sshll2.4s v6, v4, #0
589-
; CHECK-NEXT: sshll.4s v4, v4, #0
590-
; CHECK-NEXT: ushll.8h v1, v1, #0
591-
; CHECK-NEXT: sshll.8h v2, v2, #0
599+
; CHECK-NEXT: cmgt.16b v6, v5, v2
600+
; CHECK-NEXT: tbl.16b v7, { v5 }, v0
601+
; CHECK-NEXT: tbl.16b v16, { v5 }, v1
602+
; CHECK-NEXT: sshll2.8h v18, v6, #0
603+
; CHECK-NEXT: tbl.16b v17, { v5 }, v3
604+
; CHECK-NEXT: sshll2.4s v19, v18, #0
605+
; CHECK-NEXT: sshll.4s v18, v18, #0
606+
; CHECK-NEXT: tbl.16b v5, { v5 }, v4
607+
; CHECK-NEXT: sshll.8h v6, v6, #0
608+
; CHECK-NEXT: and.16b v7, v7, v19
609+
; CHECK-NEXT: and.16b v16, v16, v18
610+
; CHECK-NEXT: stp q16, q7, [x1, #32]
611+
; CHECK-NEXT: sshll2.4s v7, v6, #0
612+
; CHECK-NEXT: sshll.4s v6, v6, #0
613+
; CHECK-NEXT: and.16b v7, v17, v7
592614
; CHECK-NEXT: and.16b v5, v5, v6
593-
; CHECK-NEXT: and.16b v3, v3, v4
594-
; CHECK-NEXT: stp q3, q5, [x1, #32]
595-
; CHECK-NEXT: sshll2.4s v4, v2, #0
596-
; CHECK-NEXT: sshll.4s v2, v2, #0
597-
; CHECK-NEXT: ushll2.4s v3, v1, #0
598-
; CHECK-NEXT: ushll.4s v1, v1, #0
599-
; CHECK-NEXT: and.16b v3, v3, v4
600-
; CHECK-NEXT: and.16b v1, v1, v2
601-
; CHECK-NEXT: stp q1, q3, [x1], #64
615+
; CHECK-NEXT: stp q5, q7, [x1], #64
602616
; CHECK-NEXT: b.ne LBB24_1
603617
; CHECK-NEXT: ; %bb.2: ; %exit
604618
; CHECK-NEXT: ret
619+
; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9
620+
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8
621+
; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7
622+
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6
605623
entry:
606624
br label %loop
607625

@@ -627,23 +645,23 @@ exit:
627645
define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
628646
; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32:
629647
; CHECK: ; %bb.0: ; %entry
630-
; CHECK-NEXT: Lloh2:
648+
; CHECK-NEXT: Lloh10:
631649
; CHECK-NEXT: adrp x9, lCPI25_0@PAGE
632-
; CHECK-NEXT: Lloh3:
650+
; CHECK-NEXT: Lloh11:
633651
; CHECK-NEXT: adrp x10, lCPI25_1@PAGE
634-
; CHECK-NEXT: Lloh4:
652+
; CHECK-NEXT: Lloh12:
635653
; CHECK-NEXT: adrp x11, lCPI25_2@PAGE
636-
; CHECK-NEXT: Lloh5:
654+
; CHECK-NEXT: Lloh13:
637655
; CHECK-NEXT: adrp x12, lCPI25_3@PAGE
638656
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
639657
; CHECK-NEXT: mov x8, xzr
640-
; CHECK-NEXT: Lloh6:
658+
; CHECK-NEXT: Lloh14:
641659
; CHECK-NEXT: ldr q0, [x9, lCPI25_0@PAGEOFF]
642-
; CHECK-NEXT: Lloh7:
660+
; CHECK-NEXT: Lloh15:
643661
; CHECK-NEXT: ldr q1, [x10, lCPI25_1@PAGEOFF]
644-
; CHECK-NEXT: Lloh8:
662+
; CHECK-NEXT: Lloh16:
645663
; CHECK-NEXT: ldr q3, [x11, lCPI25_2@PAGEOFF]
646-
; CHECK-NEXT: Lloh9:
664+
; CHECK-NEXT: Lloh17:
647665
; CHECK-NEXT: ldr q4, [x12, lCPI25_3@PAGEOFF]
648666
; CHECK-NEXT: LBB25_1: ; %loop
649667
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -670,10 +688,10 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
670688
; CHECK-NEXT: b.ne LBB25_1
671689
; CHECK-NEXT: ; %bb.2: ; %exit
672690
; CHECK-NEXT: ret
673-
; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9
674-
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8
675-
; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7
676-
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6
691+
; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17
692+
; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16
693+
; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15
694+
; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14
677695
entry:
678696
br label %loop
679697

@@ -700,23 +718,23 @@ exit:
700718
define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
701719
; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32:
702720
; CHECK: ; %bb.0: ; %entry
703-
; CHECK-NEXT: Lloh10:
721+
; CHECK-NEXT: Lloh18:
704722
; CHECK-NEXT: adrp x9, lCPI26_0@PAGE
705-
; CHECK-NEXT: Lloh11:
723+
; CHECK-NEXT: Lloh19:
706724
; CHECK-NEXT: adrp x10, lCPI26_1@PAGE
707-
; CHECK-NEXT: Lloh12:
725+
; CHECK-NEXT: Lloh20:
708726
; CHECK-NEXT: adrp x11, lCPI26_2@PAGE
709-
; CHECK-NEXT: Lloh13:
727+
; CHECK-NEXT: Lloh21:
710728
; CHECK-NEXT: adrp x12, lCPI26_3@PAGE
711729
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
712730
; CHECK-NEXT: mov x8, xzr
713-
; CHECK-NEXT: Lloh14:
731+
; CHECK-NEXT: Lloh22:
714732
; CHECK-NEXT: ldr q0, [x9, lCPI26_0@PAGEOFF]
715-
; CHECK-NEXT: Lloh15:
733+
; CHECK-NEXT: Lloh23:
716734
; CHECK-NEXT: ldr q1, [x10, lCPI26_1@PAGEOFF]
717-
; CHECK-NEXT: Lloh16:
735+
; CHECK-NEXT: Lloh24:
718736
; CHECK-NEXT: ldr q3, [x11, lCPI26_2@PAGEOFF]
719-
; CHECK-NEXT: Lloh17:
737+
; CHECK-NEXT: Lloh25:
720738
; CHECK-NEXT: ldr q4, [x12, lCPI26_3@PAGEOFF]
721739
; CHECK-NEXT: LBB26_1: ; %loop
722740
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -743,10 +761,10 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
743761
; CHECK-NEXT: b.ne LBB26_1
744762
; CHECK-NEXT: ; %bb.2: ; %exit
745763
; CHECK-NEXT: ret
746-
; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17
747-
; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16
748-
; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15
749-
; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14
764+
; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25
765+
; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24
766+
; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23
767+
; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22
750768
entry:
751769
br label %loop
752770

0 commit comments

Comments
 (0)