Skip to content

Commit a48cd3d

Browse files
committed
[AArch64] Add tests for lowering trunc to i8 using tbl.
(cherry-picked from 39fcb4a)
1 parent 62ed9b7 commit a48cd3d

File tree

1 file changed

+96
-0
lines changed

1 file changed

+96
-0
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -o - %s | FileCheck %s
3+
4+
; The RUN line passes no -mtriple, so llc takes the target from the triple
; declared below.
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
5+
target triple = "arm64-apple-ios"
6+
7+
; It's profitable to use a single tbl.4 instruction to lower the truncate.
; The assertions below record the lowering currently produced for this
; function: the four q-register halves of the <16 x i32> are narrowed in two
; steps (two uzp1.8h, then one uzp1.16b); no tbl appears in them.
; The loop body loads one <16 x i32> from %A, truncates it to <16 x i8> and
; stores it to %dst, with both addresses indexed by %iv.
8+
define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
9+
; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
10+
; CHECK: ; %bb.0: ; %entry
11+
; CHECK-NEXT: mov x8, xzr
12+
; CHECK-NEXT: LBB0_1: ; %loop
13+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
14+
; CHECK-NEXT: add x9, x0, x8, lsl #6
15+
; CHECK-NEXT: ldp q1, q0, [x9, #32]
16+
; CHECK-NEXT: ldp q3, q2, [x9]
17+
; CHECK-NEXT: uzp1.8h v0, v1, v0
18+
; CHECK-NEXT: uzp1.8h v1, v3, v2
19+
; CHECK-NEXT: uzp1.16b v0, v1, v0
20+
; CHECK-NEXT: str q0, [x1, x8, lsl #4]
21+
; CHECK-NEXT: add x8, x8, #1
22+
; CHECK-NEXT: cmp x8, #1000
23+
; CHECK-NEXT: b.eq LBB0_1
24+
; CHECK-NEXT: ; %bb.2: ; %exit
25+
; CHECK-NEXT: ret
26+
entry:
27+
br label %loop
28+
29+
loop:
30+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
31+
%gep.A = getelementptr inbounds <16 x i32>, ptr %A, i64 %iv
32+
%l.A = load <16 x i32>, ptr %gep.A
33+
%trunc = trunc <16 x i32> %l.A to <16 x i8>
34+
%gep.dst = getelementptr inbounds <16 x i8>, ptr %dst, i64 %iv
35+
store <16 x i8> %trunc, ptr %gep.dst
36+
%iv.next = add i64 %iv, 1
37+
; NOTE(review): the back edge to %loop is taken only when %iv.next == 1000.
; %iv starts at 0, so the first compare (1 against 1000) is false and control
; leaves for %exit after one pass through the body. Confirm whether this
; successor order is intended; swapping it would change the expected b.eq
; and requires regenerating the assertions with update_llc_test_checks.py.
%ec = icmp eq i64 %iv.next, 1000
38+
br i1 %ec, label %loop, label %exit
39+
40+
exit:
41+
ret void
42+
}
43+
44+
; Not profitable to use tbl, as materializing the masks requires more
45+
; instructions.
; Straight-line variant of the <16 x i32> -> <16 x i8> truncate: a single
; load from %A, one trunc and one store to %dst, with no loop around them.
; The assertions below expect two uzp1.8h followed by one uzp1.16b.
46+
define void @trunc_v16i32_to_v16i8_no_loop(ptr %A, ptr %dst) {
47+
; CHECK-LABEL: trunc_v16i32_to_v16i8_no_loop:
48+
; CHECK: ; %bb.0: ; %entry
49+
; CHECK-NEXT: ldp q1, q0, [x0, #32]
50+
; CHECK-NEXT: ldp q3, q2, [x0]
51+
; CHECK-NEXT: uzp1.8h v0, v1, v0
52+
; CHECK-NEXT: uzp1.8h v1, v3, v2
53+
; CHECK-NEXT: uzp1.16b v0, v1, v0
54+
; CHECK-NEXT: str q0, [x1]
55+
; CHECK-NEXT: ret
56+
entry:
57+
%l.A = load <16 x i32>, ptr %A
58+
%trunc = trunc <16 x i32> %l.A to <16 x i8>
59+
store <16 x i8> %trunc, ptr %dst
60+
ret void
61+
}
62+
63+
; It's profitable to use a single tbl.2 instruction to lower the truncate.
; The assertions below record the lowering currently produced for this
; function: one uzp1.8h followed by xtn.8b, with the 8-byte result stored
; from d0; no tbl appears in them.
; The loop body loads one <8 x i32> from %A, truncates it to <8 x i8> and
; stores it to %dst, with both addresses indexed by %iv.
64+
define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
65+
; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
66+
; CHECK: ; %bb.0: ; %entry
67+
; CHECK-NEXT: mov x8, xzr
68+
; CHECK-NEXT: LBB2_1: ; %loop
69+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
70+
; CHECK-NEXT: add x9, x0, x8, lsl #5
71+
; CHECK-NEXT: ldp q1, q0, [x9]
72+
; CHECK-NEXT: uzp1.8h v0, v1, v0
73+
; CHECK-NEXT: xtn.8b v0, v0
74+
; CHECK-NEXT: str d0, [x1, x8, lsl #3]
75+
; CHECK-NEXT: add x8, x8, #1
76+
; CHECK-NEXT: cmp x8, #1000
77+
; CHECK-NEXT: b.eq LBB2_1
78+
; CHECK-NEXT: ; %bb.2: ; %exit
79+
; CHECK-NEXT: ret
80+
entry:
81+
br label %loop
82+
83+
loop:
84+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
85+
%gep.A = getelementptr inbounds <8 x i32>, ptr %A, i64 %iv
86+
%l.A = load <8 x i32>, ptr %gep.A
87+
%trunc = trunc <8 x i32> %l.A to <8 x i8>
88+
%gep.dst = getelementptr inbounds <8 x i8>, ptr %dst, i64 %iv
89+
store <8 x i8> %trunc, ptr %gep.dst
90+
%iv.next = add i64 %iv, 1
91+
; NOTE(review): the back edge to %loop is taken only when %iv.next == 1000.
; %iv starts at 0, so the first compare (1 against 1000) is false and control
; leaves for %exit after one pass through the body. Confirm whether this
; successor order is intended; swapping it would change the expected b.eq
; and requires regenerating the assertions with update_llc_test_checks.py.
%ec = icmp eq i64 %iv.next, 1000
92+
br i1 %ec, label %loop, label %exit
93+
94+
exit:
95+
ret void
96+
}

0 commit comments

Comments
 (0)