|
2 | 2 | ; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
|
3 | 3 | ; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s
|
4 | 4 |
|
| 5 | +; CHECK-LABEL: lCPI0_0: |
| 6 | +; CHECK-NEXT: .byte 0 ; 0x0 |
| 7 | +; CHECK-NEXT: .byte 4 ; 0x4 |
| 8 | +; CHECK-NEXT: .byte 8 ; 0x8 |
| 9 | +; CHECK-NEXT: .byte 12 ; 0xc |
| 10 | +; CHECK-NEXT: .byte 16 ; 0x10 |
| 11 | +; CHECK-NEXT: .byte 20 ; 0x14 |
| 12 | +; CHECK-NEXT: .byte 24 ; 0x18 |
| 13 | +; CHECK-NEXT: .byte 28 ; 0x1c |
| 14 | +; CHECK-NEXT: .byte 32 ; 0x20 |
| 15 | +; CHECK-NEXT: .byte 36 ; 0x24 |
| 16 | +; CHECK-NEXT: .byte 40 ; 0x28 |
| 17 | +; CHECK-NEXT: .byte 44 ; 0x2c |
| 18 | +; CHECK-NEXT: .byte 48 ; 0x30 |
| 19 | +; CHECK-NEXT: .byte 52 ; 0x34 |
| 20 | +; CHECK-NEXT: .byte 56 ; 0x38 |
| 21 | +; CHECK-NEXT: .byte 60 ; 0x3c |
| 22 | + |
| 23 | +; CHECK-BE-LABEL: .LCPI0_0: |
| 24 | +; CHECK-BE-NEXT: .byte 3 // 0x3 |
| 25 | +; CHECK-BE-NEXT: .byte 7 // 0x7 |
| 26 | +; CHECK-BE-NEXT: .byte 11 // 0xb |
| 27 | +; CHECK-BE-NEXT: .byte 15 // 0xf |
| 28 | +; CHECK-BE-NEXT: .byte 19 // 0x13 |
| 29 | +; CHECK-BE-NEXT: .byte 23 // 0x17 |
| 30 | +; CHECK-BE-NEXT: .byte 27 // 0x1b |
| 31 | +; CHECK-BE-NEXT: .byte 31 // 0x1f |
| 32 | +; CHECK-BE-NEXT: .byte 35 // 0x23 |
| 33 | +; CHECK-BE-NEXT: .byte 39 // 0x27 |
| 34 | +; CHECK-BE-NEXT: .byte 43 // 0x2b |
| 35 | +; CHECK-BE-NEXT: .byte 47 // 0x2f |
| 36 | +; CHECK-BE-NEXT: .byte 51 // 0x33 |
| 37 | +; CHECK-BE-NEXT: .byte 55 // 0x37 |
| 38 | +; CHECK-BE-NEXT: .byte 59 // 0x3b |
| 39 | +; CHECK-BE-NEXT: .byte 63 // 0x3f |
| 40 | + |
5 | 41 | ; It's profitable to use a single tbl.4 instruction to lower the truncate.
|
6 | 42 | define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
|
7 | 43 | ; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
|
8 | 44 | ; CHECK: ; %bb.0: ; %entry
|
| 45 | +; CHECK-NEXT: Lloh0: |
| 46 | +; CHECK-NEXT: adrp x9, lCPI0_0@PAGE |
9 | 47 | ; CHECK-NEXT: mov x8, xzr
|
| 48 | +; CHECK-NEXT: Lloh1: |
| 49 | +; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF] |
10 | 50 | ; CHECK-NEXT: LBB0_1: ; %loop
|
11 | 51 | ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
12 | 52 | ; CHECK-NEXT: add x9, x0, x8, lsl #6
|
13 |
| -; CHECK-NEXT: ldp q1, q0, [x9, #32] |
14 |
| -; CHECK-NEXT: ldp q3, q2, [x9] |
15 |
| -; CHECK-NEXT: uzp1.8h v0, v1, v0 |
16 |
| -; CHECK-NEXT: uzp1.8h v1, v3, v2 |
17 |
| -; CHECK-NEXT: uzp1.16b v0, v1, v0 |
18 |
| -; CHECK-NEXT: str q0, [x1, x8, lsl #4] |
| 53 | +; CHECK-NEXT: ldp q1, q2, [x9] |
| 54 | +; CHECK-NEXT: ldp q3, q4, [x9, #32] |
| 55 | +; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0 |
| 56 | +; CHECK-NEXT: str q1, [x1, x8, lsl #4] |
19 | 57 | ; CHECK-NEXT: add x8, x8, #1
|
20 | 58 | ; CHECK-NEXT: cmp x8, #1000
|
21 | 59 | ; CHECK-NEXT: b.eq LBB0_1
|
22 | 60 | ; CHECK-NEXT: ; %bb.2: ; %exit
|
23 | 61 | ; CHECK-NEXT: ret
|
| 62 | +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 |
24 | 63 | ;
|
25 | 64 | ; CHECK-BE-LABEL: trunc_v16i32_to_v16i8_in_loop:
|
26 | 65 | ; CHECK-BE: // %bb.0: // %entry
|
| 66 | +; CHECK-BE-NEXT: adrp x8, .LCPI0_0 |
| 67 | +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_0 |
| 68 | +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] |
27 | 69 | ; CHECK-BE-NEXT: mov x8, xzr
|
28 | 70 | ; CHECK-BE-NEXT: .LBB0_1: // %loop
|
29 | 71 | ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
|
30 | 72 | ; CHECK-BE-NEXT: add x9, x0, x8, lsl #6
|
31 |
| -; CHECK-BE-NEXT: add x10, x9, #48 |
| 73 | +; CHECK-BE-NEXT: add x10, x9, #16 |
32 | 74 | ; CHECK-BE-NEXT: add x11, x9, #32
|
33 |
| -; CHECK-BE-NEXT: ld1 { v0.4s }, [x9] |
34 |
| -; CHECK-BE-NEXT: add x9, x9, #16 |
35 |
| -; CHECK-BE-NEXT: ld1 { v1.4s }, [x10] |
36 |
| -; CHECK-BE-NEXT: ld1 { v2.4s }, [x11] |
37 |
| -; CHECK-BE-NEXT: ld1 { v3.4s }, [x9] |
| 75 | +; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] |
| 76 | +; CHECK-BE-NEXT: add x9, x9, #48 |
| 77 | +; CHECK-BE-NEXT: ld1 { v2.16b }, [x10] |
| 78 | +; CHECK-BE-NEXT: ld1 { v3.16b }, [x11] |
| 79 | +; CHECK-BE-NEXT: ld1 { v4.16b }, [x9] |
38 | 80 | ; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
|
39 | 81 | ; CHECK-BE-NEXT: add x8, x8, #1
|
40 | 82 | ; CHECK-BE-NEXT: cmp x8, #1000
|
41 |
| -; CHECK-BE-NEXT: uzp1 v1.8h, v2.8h, v1.8h |
42 |
| -; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v3.8h |
43 |
| -; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b |
44 |
| -; CHECK-BE-NEXT: st1 { v0.16b }, [x9] |
| 83 | +; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b |
| 84 | +; CHECK-BE-NEXT: st1 { v1.16b }, [x9] |
45 | 85 | ; CHECK-BE-NEXT: b.eq .LBB0_1
|
46 | 86 | ; CHECK-BE-NEXT: // %bb.2: // %exit
|
47 | 87 | ; CHECK-BE-NEXT: ret
|
| 88 | + |
48 | 89 | entry:
|
49 | 90 | br label %loop
|
50 | 91 |
|
@@ -97,42 +138,85 @@ entry:
|
97 | 138 | ret void
|
98 | 139 | }
|
99 | 140 |
|
| 141 | + |
| 142 | +; CHECK-LABEL: lCPI2_0: |
| 143 | +; CHECK-NEXT: .byte 0 ; 0x0 |
| 144 | +; CHECK-NEXT: .byte 4 ; 0x4 |
| 145 | +; CHECK-NEXT: .byte 8 ; 0x8 |
| 146 | +; CHECK-NEXT: .byte 12 ; 0xc |
| 147 | +; CHECK-NEXT: .byte 16 ; 0x10 |
| 148 | +; CHECK-NEXT: .byte 20 ; 0x14 |
| 149 | +; CHECK-NEXT: .byte 24 ; 0x18 |
| 150 | +; CHECK-NEXT: .byte 28 ; 0x1c |
| 151 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 152 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 153 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 154 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 155 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 156 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 157 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 158 | +; CHECK-NEXT: .byte 255 ; 0xff |
| 159 | + |
| 160 | +; CHECK-BE-LABEL: .LCPI2_0: |
| 161 | +; CHECK-BE-NEXT: .byte 3 // 0x3 |
| 162 | +; CHECK-BE-NEXT: .byte 7 // 0x7 |
| 163 | +; CHECK-BE-NEXT: .byte 11 // 0xb |
| 164 | +; CHECK-BE-NEXT: .byte 15 // 0xf |
| 165 | +; CHECK-BE-NEXT: .byte 19 // 0x13 |
| 166 | +; CHECK-BE-NEXT: .byte 23 // 0x17 |
| 167 | +; CHECK-BE-NEXT: .byte 27 // 0x1b |
| 168 | +; CHECK-BE-NEXT: .byte 31 // 0x1f |
| 169 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
| 170 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
| 171 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
| 172 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
| 173 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
| 174 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
| 175 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
| 176 | +; CHECK-BE-NEXT: .byte 255 // 0xff |
100 | 177 | ; It's profitable to use a single tbl.2 instruction to lower the truncate.
|
101 | 178 | define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
|
102 | 179 | ; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
|
103 | 180 | ; CHECK: ; %bb.0: ; %entry
|
| 181 | +; CHECK-NEXT: Lloh2: |
| 182 | +; CHECK-NEXT: adrp x9, lCPI2_0@PAGE |
104 | 183 | ; CHECK-NEXT: mov x8, xzr
|
| 184 | +; CHECK-NEXT: Lloh3: |
| 185 | +; CHECK-NEXT: ldr q0, [x9, lCPI2_0@PAGEOFF] |
105 | 186 | ; CHECK-NEXT: LBB2_1: ; %loop
|
106 | 187 | ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
107 | 188 | ; CHECK-NEXT: add x9, x0, x8, lsl #5
|
108 |
| -; CHECK-NEXT: ldp q1, q0, [x9] |
109 |
| -; CHECK-NEXT: uzp1.8h v0, v1, v0 |
110 |
| -; CHECK-NEXT: xtn.8b v0, v0 |
111 |
| -; CHECK-NEXT: str d0, [x1, x8, lsl #3] |
| 189 | +; CHECK-NEXT: ldp q1, q2, [x9] |
| 190 | +; CHECK-NEXT: tbl.16b v1, { v1, v2 }, v0 |
| 191 | +; CHECK-NEXT: str d1, [x1, x8, lsl #3] |
112 | 192 | ; CHECK-NEXT: add x8, x8, #1
|
113 | 193 | ; CHECK-NEXT: cmp x8, #1000
|
114 | 194 | ; CHECK-NEXT: b.eq LBB2_1
|
115 | 195 | ; CHECK-NEXT: ; %bb.2: ; %exit
|
116 | 196 | ; CHECK-NEXT: ret
|
| 197 | +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3 |
117 | 198 | ;
|
118 | 199 | ; CHECK-BE-LABEL: trunc_v8i32_to_v8i8_in_loop:
|
119 | 200 | ; CHECK-BE: // %bb.0: // %entry
|
| 201 | +; CHECK-BE-NEXT: adrp x8, .LCPI2_0 |
| 202 | +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI2_0 |
| 203 | +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] |
120 | 204 | ; CHECK-BE-NEXT: mov x8, xzr
|
121 | 205 | ; CHECK-BE-NEXT: .LBB2_1: // %loop
|
122 | 206 | ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
|
123 | 207 | ; CHECK-BE-NEXT: add x9, x0, x8, lsl #5
|
124 | 208 | ; CHECK-BE-NEXT: add x10, x9, #16
|
125 |
| -; CHECK-BE-NEXT: ld1 { v0.4s }, [x9] |
| 209 | +; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] |
126 | 210 | ; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
|
127 | 211 | ; CHECK-BE-NEXT: add x8, x8, #1
|
128 |
| -; CHECK-BE-NEXT: ld1 { v1.4s }, [x10] |
| 212 | +; CHECK-BE-NEXT: ld1 { v2.16b }, [x10] |
129 | 213 | ; CHECK-BE-NEXT: cmp x8, #1000
|
130 |
| -; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h |
131 |
| -; CHECK-BE-NEXT: xtn v0.8b, v0.8h |
132 |
| -; CHECK-BE-NEXT: st1 { v0.8b }, [x9] |
| 214 | +; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v0.16b |
| 215 | +; CHECK-BE-NEXT: st1 { v1.8b }, [x9] |
133 | 216 | ; CHECK-BE-NEXT: b.eq .LBB2_1
|
134 | 217 | ; CHECK-BE-NEXT: // %bb.2: // %exit
|
135 | 218 | ; CHECK-BE-NEXT: ret
|
| 219 | + |
136 | 220 | entry:
|
137 | 221 | br label %loop
|
138 | 222 |
|
|
0 commit comments