Skip to content

Commit 8491d01

Browse files
committed
[AArch64] Lower vector trunc using tbl.
Similar to using tbl to lower vector ZExts, tbl4 can be used to lower vector truncates. The initial version supports i32->i8 conversions. Depends on D120571. Reviewed By: t.p.northover. Differential Revision: https://reviews.llvm.org/D133495
1 parent c7c0ce7 commit 8491d01

File tree

3 files changed

+162
-27
lines changed

3 files changed

+162
-27
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8047,8 +8047,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
80478047
if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
80488048
return true;
80498049

8050-
if (isa<UIToFPInst>(I) && TLI->optimizeExtendOrTruncateConversion(
8051-
I, LI->getLoopFor(I->getParent())))
8050+
if ((isa<UIToFPInst>(I) || isa<TruncInst>(I)) &&
8051+
TLI->optimizeExtendOrTruncateConversion(I,
8052+
LI->getLoopFor(I->getParent())))
80528053
return true;
80538054

80548055
if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13209,6 +13209,44 @@ static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
1320913209
ZExt->eraseFromParent();
1321013210
}
1321113211

13212+
// Replace the vector truncate TI with a call to the AArch64 NEON tbl2/tbl4
// intrinsic. Operand 0 of TI is split into 4-element pieces, each bitcast to
// <16 x i8> and passed as a table register. A constant byte mask then picks one
// byte out of every 4-byte source lane: byte 0 of the lane when IsLittleEndian
// is true, byte 3 otherwise. All uses of TI are rewritten to the new value and
// TI is erased.
// NOTE(review): the shuffle indices and the stride-4 mask assume an
// <8 x i32> or <16 x i32> source truncated to i8. Nothing in this function
// checks that, so the caller has to guarantee it.
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
13213+
IRBuilder<> Builder(TI);
13214+
SmallVector<Value *> Parts;
13215+
Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
13216+
// Source lanes 0-7 are needed for both supported element counts.
Parts.push_back(Builder.CreateBitCast(
13217+
Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
13218+
Parts.push_back(Builder.CreateBitCast(
13219+
Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
13220+
13221+
Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
13222+
unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
13223+
// A 16-element result needs source lanes 8-15 as two further table
// registers, and therefore the four-register tbl4 intrinsic.
if (NumElements == 16) {
13224+
Parts.push_back(Builder.CreateBitCast(
13225+
Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
13226+
Parts.push_back(Builder.CreateBitCast(
13227+
Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
13228+
VecTy));
13229+
TblID = Intrinsic::aarch64_neon_tbl4;
13230+
}
13231+
SmallVector<Constant *, 16> MaskConst;
13232+
// One mask entry per result element: the byte offset, within the
// concatenated table registers, of the byte to keep from that source lane.
for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
13233+
MaskConst.push_back(
13234+
ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));
13235+
13236+
// Pad the mask to 16 entries with index 255; this only adds entries when
// NumElements is 8.
// NOTE(review): presumably relies on tbl yielding 0 for out-of-range
// indices -- confirm against the Arm ISA description of TBL.
for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
13237+
MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
13238+
13239+
// The index mask is passed as the last argument, after the table registers.
Parts.push_back(ConstantVector::get(MaskConst));
13240+
auto *F =
13241+
Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
13242+
Value *Res = Builder.CreateCall(F, Parts);
13243+
13244+
// The intrinsic is declared on the <16 x i8> type, so for the 8-element
// case keep only the first 8 lanes of the result.
if (NumElements == 8)
13245+
Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
13246+
TI->replaceAllUsesWith(Res);
13247+
TI->eraseFromParent();
13248+
}
13249+
1321213250
bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
1321313251
Loop *L) const {
1321413252
// Try to optimize conversions using tbl. This requires materializing constant
@@ -13250,6 +13288,18 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
1325013288
createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
1325113289
return true;
1325213290
}
13291+
13292+
// Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.2/tbl.4
13293+
// instruction selecting the lowest 8 bits per lane of the input interpreted
13294+
// as 2 or 4 <4 x i32> vectors.
13295+
auto *TI = dyn_cast<TruncInst>(I);
13296+
if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13297+
SrcTy->getElementType()->isIntegerTy(32) &&
13298+
DstTy->getElementType()->isIntegerTy(8)) {
13299+
createTblForTrunc(TI, Subtarget->isLittleEndian());
13300+
return true;
13301+
}
13302+
1325313303
return false;
1325413304
}
1325513305

llvm/test/CodeGen/AArch64/trunc-to-tbl.ll

Lines changed: 109 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,49 +2,90 @@
22
; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
33
; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s
44

5+
; CHECK-LABEL: lCPI0_0:
6+
; CHECK-NEXT: .byte 0 ; 0x0
7+
; CHECK-NEXT: .byte 4 ; 0x4
8+
; CHECK-NEXT: .byte 8 ; 0x8
9+
; CHECK-NEXT: .byte 12 ; 0xc
10+
; CHECK-NEXT: .byte 16 ; 0x10
11+
; CHECK-NEXT: .byte 20 ; 0x14
12+
; CHECK-NEXT: .byte 24 ; 0x18
13+
; CHECK-NEXT: .byte 28 ; 0x1c
14+
; CHECK-NEXT: .byte 32 ; 0x20
15+
; CHECK-NEXT: .byte 36 ; 0x24
16+
; CHECK-NEXT: .byte 40 ; 0x28
17+
; CHECK-NEXT: .byte 44 ; 0x2c
18+
; CHECK-NEXT: .byte 48 ; 0x30
19+
; CHECK-NEXT: .byte 52 ; 0x34
20+
; CHECK-NEXT: .byte 56 ; 0x38
21+
; CHECK-NEXT: .byte 60 ; 0x3c
22+
23+
; CHECK-BE-LABEL: .LCPI0_0:
24+
; CHECK-BE-NEXT: .byte 3 // 0x3
25+
; CHECK-BE-NEXT: .byte 7 // 0x7
26+
; CHECK-BE-NEXT: .byte 11 // 0xb
27+
; CHECK-BE-NEXT: .byte 15 // 0xf
28+
; CHECK-BE-NEXT: .byte 19 // 0x13
29+
; CHECK-BE-NEXT: .byte 23 // 0x17
30+
; CHECK-BE-NEXT: .byte 27 // 0x1b
31+
; CHECK-BE-NEXT: .byte 31 // 0x1f
32+
; CHECK-BE-NEXT: .byte 35 // 0x23
33+
; CHECK-BE-NEXT: .byte 39 // 0x27
34+
; CHECK-BE-NEXT: .byte 43 // 0x2b
35+
; CHECK-BE-NEXT: .byte 47 // 0x2f
36+
; CHECK-BE-NEXT: .byte 51 // 0x33
37+
; CHECK-BE-NEXT: .byte 55 // 0x37
38+
; CHECK-BE-NEXT: .byte 59 // 0x3b
39+
; CHECK-BE-NEXT: .byte 63 // 0x3f
40+
541
; It's profitable to use a single tbl.4 instruction to lower the truncate.
642
define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
743
; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
844
; CHECK: ; %bb.0: ; %entry
45+
; CHECK-NEXT: Lloh0:
46+
; CHECK-NEXT: adrp x9, lCPI0_0@PAGE
947
; CHECK-NEXT: mov x8, xzr
48+
; CHECK-NEXT: Lloh1:
49+
; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF]
1050
; CHECK-NEXT: LBB0_1: ; %loop
1151
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
1252
; CHECK-NEXT: add x9, x0, x8, lsl #6
13-
; CHECK-NEXT: ldp q1, q0, [x9, #32]
14-
; CHECK-NEXT: ldp q3, q2, [x9]
15-
; CHECK-NEXT: uzp1.8h v0, v1, v0
16-
; CHECK-NEXT: uzp1.8h v1, v3, v2
17-
; CHECK-NEXT: uzp1.16b v0, v1, v0
18-
; CHECK-NEXT: str q0, [x1, x8, lsl #4]
53+
; CHECK-NEXT: ldp q1, q2, [x9]
54+
; CHECK-NEXT: ldp q3, q4, [x9, #32]
55+
; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0
56+
; CHECK-NEXT: str q1, [x1, x8, lsl #4]
1957
; CHECK-NEXT: add x8, x8, #1
2058
; CHECK-NEXT: cmp x8, #1000
2159
; CHECK-NEXT: b.eq LBB0_1
2260
; CHECK-NEXT: ; %bb.2: ; %exit
2361
; CHECK-NEXT: ret
62+
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
2463
;
2564
; CHECK-BE-LABEL: trunc_v16i32_to_v16i8_in_loop:
2665
; CHECK-BE: // %bb.0: // %entry
66+
; CHECK-BE-NEXT: adrp x8, .LCPI0_0
67+
; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_0
68+
; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
2769
; CHECK-BE-NEXT: mov x8, xzr
2870
; CHECK-BE-NEXT: .LBB0_1: // %loop
2971
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
3072
; CHECK-BE-NEXT: add x9, x0, x8, lsl #6
31-
; CHECK-BE-NEXT: add x10, x9, #48
73+
; CHECK-BE-NEXT: add x10, x9, #16
3274
; CHECK-BE-NEXT: add x11, x9, #32
33-
; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]
34-
; CHECK-BE-NEXT: add x9, x9, #16
35-
; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]
36-
; CHECK-BE-NEXT: ld1 { v2.4s }, [x11]
37-
; CHECK-BE-NEXT: ld1 { v3.4s }, [x9]
75+
; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
76+
; CHECK-BE-NEXT: add x9, x9, #48
77+
; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
78+
; CHECK-BE-NEXT: ld1 { v3.16b }, [x11]
79+
; CHECK-BE-NEXT: ld1 { v4.16b }, [x9]
3880
; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
3981
; CHECK-BE-NEXT: add x8, x8, #1
4082
; CHECK-BE-NEXT: cmp x8, #1000
41-
; CHECK-BE-NEXT: uzp1 v1.8h, v2.8h, v1.8h
42-
; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
43-
; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
44-
; CHECK-BE-NEXT: st1 { v0.16b }, [x9]
83+
; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
84+
; CHECK-BE-NEXT: st1 { v1.16b }, [x9]
4585
; CHECK-BE-NEXT: b.eq .LBB0_1
4686
; CHECK-BE-NEXT: // %bb.2: // %exit
4787
; CHECK-BE-NEXT: ret
88+
4889
entry:
4990
br label %loop
5091

@@ -97,42 +138,85 @@ entry:
97138
ret void
98139
}
99140

141+
142+
; CHECK-LABEL: lCPI2_0:
143+
; CHECK-NEXT: .byte 0 ; 0x0
144+
; CHECK-NEXT: .byte 4 ; 0x4
145+
; CHECK-NEXT: .byte 8 ; 0x8
146+
; CHECK-NEXT: .byte 12 ; 0xc
147+
; CHECK-NEXT: .byte 16 ; 0x10
148+
; CHECK-NEXT: .byte 20 ; 0x14
149+
; CHECK-NEXT: .byte 24 ; 0x18
150+
; CHECK-NEXT: .byte 28 ; 0x1c
151+
; CHECK-NEXT: .byte 255 ; 0xff
152+
; CHECK-NEXT: .byte 255 ; 0xff
153+
; CHECK-NEXT: .byte 255 ; 0xff
154+
; CHECK-NEXT: .byte 255 ; 0xff
155+
; CHECK-NEXT: .byte 255 ; 0xff
156+
; CHECK-NEXT: .byte 255 ; 0xff
157+
; CHECK-NEXT: .byte 255 ; 0xff
158+
; CHECK-NEXT: .byte 255 ; 0xff
159+
160+
; CHECK-BE-LABEL: .LCPI2_0:
161+
; CHECK-BE-NEXT: .byte 3 // 0x3
162+
; CHECK-BE-NEXT: .byte 7 // 0x7
163+
; CHECK-BE-NEXT: .byte 11 // 0xb
164+
; CHECK-BE-NEXT: .byte 15 // 0xf
165+
; CHECK-BE-NEXT: .byte 19 // 0x13
166+
; CHECK-BE-NEXT: .byte 23 // 0x17
167+
; CHECK-BE-NEXT: .byte 27 // 0x1b
168+
; CHECK-BE-NEXT: .byte 31 // 0x1f
169+
; CHECK-BE-NEXT: .byte 255 // 0xff
170+
; CHECK-BE-NEXT: .byte 255 // 0xff
171+
; CHECK-BE-NEXT: .byte 255 // 0xff
172+
; CHECK-BE-NEXT: .byte 255 // 0xff
173+
; CHECK-BE-NEXT: .byte 255 // 0xff
174+
; CHECK-BE-NEXT: .byte 255 // 0xff
175+
; CHECK-BE-NEXT: .byte 255 // 0xff
176+
; CHECK-BE-NEXT: .byte 255 // 0xff
100177
; It's profitable to use a single tbl.2 instruction to lower the truncate.
101178
define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
102179
; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
103180
; CHECK: ; %bb.0: ; %entry
181+
; CHECK-NEXT: Lloh2:
182+
; CHECK-NEXT: adrp x9, lCPI2_0@PAGE
104183
; CHECK-NEXT: mov x8, xzr
184+
; CHECK-NEXT: Lloh3:
185+
; CHECK-NEXT: ldr q0, [x9, lCPI2_0@PAGEOFF]
105186
; CHECK-NEXT: LBB2_1: ; %loop
106187
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
107188
; CHECK-NEXT: add x9, x0, x8, lsl #5
108-
; CHECK-NEXT: ldp q1, q0, [x9]
109-
; CHECK-NEXT: uzp1.8h v0, v1, v0
110-
; CHECK-NEXT: xtn.8b v0, v0
111-
; CHECK-NEXT: str d0, [x1, x8, lsl #3]
189+
; CHECK-NEXT: ldp q1, q2, [x9]
190+
; CHECK-NEXT: tbl.16b v1, { v1, v2 }, v0
191+
; CHECK-NEXT: str d1, [x1, x8, lsl #3]
112192
; CHECK-NEXT: add x8, x8, #1
113193
; CHECK-NEXT: cmp x8, #1000
114194
; CHECK-NEXT: b.eq LBB2_1
115195
; CHECK-NEXT: ; %bb.2: ; %exit
116196
; CHECK-NEXT: ret
197+
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
117198
;
118199
; CHECK-BE-LABEL: trunc_v8i32_to_v8i8_in_loop:
119200
; CHECK-BE: // %bb.0: // %entry
201+
; CHECK-BE-NEXT: adrp x8, .LCPI2_0
202+
; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI2_0
203+
; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
120204
; CHECK-BE-NEXT: mov x8, xzr
121205
; CHECK-BE-NEXT: .LBB2_1: // %loop
122206
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
123207
; CHECK-BE-NEXT: add x9, x0, x8, lsl #5
124208
; CHECK-BE-NEXT: add x10, x9, #16
125-
; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]
209+
; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
126210
; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
127211
; CHECK-BE-NEXT: add x8, x8, #1
128-
; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]
212+
; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
129213
; CHECK-BE-NEXT: cmp x8, #1000
130-
; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
131-
; CHECK-BE-NEXT: xtn v0.8b, v0.8h
132-
; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
214+
; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v0.16b
215+
; CHECK-BE-NEXT: st1 { v1.8b }, [x9]
133216
; CHECK-BE-NEXT: b.eq .LBB2_1
134217
; CHECK-BE-NEXT: // %bb.2: // %exit
135218
; CHECK-BE-NEXT: ret
219+
136220
entry:
137221
br label %loop
138222

0 commit comments

Comments
 (0)