@@ -93,6 +93,137 @@ declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vsca
declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

+ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmv1r.v v9, v0
+ ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+ ; CHECK-NEXT: vmv.v.i v24, 0
+ ; CHECK-NEXT: vmv1r.v v0, v8
+ ; CHECK-NEXT: vmerge.vim v16, v24, 1, v0
+ ; CHECK-NEXT: vmv1r.v v0, v9
+ ; CHECK-NEXT: vmerge.vim v8, v24, 1, v0
+ ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+ ; CHECK-NEXT: vwaddu.vv v24, v8, v16
+ ; CHECK-NEXT: li a0, -1
+ ; CHECK-NEXT: vwmaccu.vx v24, a0, v16
+ ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+ ; CHECK-NEXT: vmsne.vi v0, v24, 0
+ ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+ ; CHECK-NEXT: vwaddu.vv v24, v12, v20
+ ; CHECK-NEXT: vwmaccu.vx v24, a0, v20
+ ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+ ; CHECK-NEXT: vmsne.vi v8, v24, 0
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
+ ret <vscale x 128 x i1> %res
+ }
+
+ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv128i8_nxv64i8:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmv8r.v v24, v8
+ ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+ ; CHECK-NEXT: vwaddu.vv v8, v24, v16
+ ; CHECK-NEXT: li a0, -1
+ ; CHECK-NEXT: vwmaccu.vx v8, a0, v16
+ ; CHECK-NEXT: vwaddu.vv v0, v28, v20
+ ; CHECK-NEXT: vwmaccu.vx v0, a0, v20
+ ; CHECK-NEXT: vmv8r.v v16, v0
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
+ ret <vscale x 128 x i8> %res
+ }
+
+ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv64i16_nxv32i16:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmv8r.v v24, v8
+ ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+ ; CHECK-NEXT: vwaddu.vv v8, v24, v16
+ ; CHECK-NEXT: li a0, -1
+ ; CHECK-NEXT: vwmaccu.vx v8, a0, v16
+ ; CHECK-NEXT: vwaddu.vv v0, v28, v20
+ ; CHECK-NEXT: vwmaccu.vx v0, a0, v20
+ ; CHECK-NEXT: vmv8r.v v16, v0
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
+ ret <vscale x 64 x i16> %res
+ }
+
+ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv32i32_nxv16i32:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmv8r.v v24, v8
+ ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+ ; CHECK-NEXT: vwaddu.vv v8, v24, v16
+ ; CHECK-NEXT: li a0, -1
+ ; CHECK-NEXT: vwmaccu.vx v8, a0, v16
+ ; CHECK-NEXT: vwaddu.vv v0, v28, v20
+ ; CHECK-NEXT: vwmaccu.vx v0, a0, v20
+ ; CHECK-NEXT: vmv8r.v v16, v0
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
+ ret <vscale x 32 x i32> %res
+ }
+
+ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv16i64_nxv8i64:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: addi sp, sp, -16
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 4
+ ; CHECK-NEXT: sub sp, sp, a0
+ ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 3
+ ; CHECK-NEXT: add a0, sp, a0
+ ; CHECK-NEXT: addi a0, a0, 16
+ ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: srli a0, a0, 1
+ ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+ ; CHECK-NEXT: vid.v v24
+ ; CHECK-NEXT: vand.vi v26, v24, 1
+ ; CHECK-NEXT: vmsne.vi v0, v26, 0
+ ; CHECK-NEXT: vsrl.vi v2, v24, 1
+ ; CHECK-NEXT: csrr a1, vlenb
+ ; CHECK-NEXT: slli a1, a1, 3
+ ; CHECK-NEXT: add a1, sp, a1
+ ; CHECK-NEXT: addi a1, a1, 16
+ ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+ ; CHECK-NEXT: vadd.vx v2, v2, a0, v0.t
+ ; CHECK-NEXT: vmv4r.v v12, v16
+ ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+ ; CHECK-NEXT: vrgatherei16.vv v24, v8, v2, v0.t
+ ; CHECK-NEXT: addi a0, sp, 16
+ ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 3
+ ; CHECK-NEXT: add a0, sp, a0
+ ; CHECK-NEXT: addi a0, a0, 16
+ ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+ ; CHECK-NEXT: vmv4r.v v16, v12
+ ; CHECK-NEXT: vrgatherei16.vv v24, v16, v2, v0.t
+ ; CHECK-NEXT: addi a0, sp, 16
+ ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+ ; CHECK-NEXT: vmv.v.v v16, v24
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 4
+ ; CHECK-NEXT: add sp, sp, a0
+ ; CHECK-NEXT: addi sp, sp, 16
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
+ ret <vscale x 16 x i64> %res
+ }
+
+ declare <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
+ declare <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
+ declare <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
+ declare <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
+ declare <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
+
; Floats

define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
@@ -193,3 +324,90 @@ declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vsca
declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+
+ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv64f16_nxv32f16:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmv8r.v v24, v8
+ ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+ ; CHECK-NEXT: vwaddu.vv v8, v24, v16
+ ; CHECK-NEXT: li a0, -1
+ ; CHECK-NEXT: vwmaccu.vx v8, a0, v16
+ ; CHECK-NEXT: vwaddu.vv v0, v28, v20
+ ; CHECK-NEXT: vwmaccu.vx v0, a0, v20
+ ; CHECK-NEXT: vmv8r.v v16, v0
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
+ ret <vscale x 64 x half> %res
+ }
+
+ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv32f32_nxv16f32:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmv8r.v v24, v8
+ ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+ ; CHECK-NEXT: vwaddu.vv v8, v24, v16
+ ; CHECK-NEXT: li a0, -1
+ ; CHECK-NEXT: vwmaccu.vx v8, a0, v16
+ ; CHECK-NEXT: vwaddu.vv v0, v28, v20
+ ; CHECK-NEXT: vwmaccu.vx v0, a0, v20
+ ; CHECK-NEXT: vmv8r.v v16, v0
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
+ ret <vscale x 32 x float> %res
+ }
+
+ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
+ ; CHECK-LABEL: vector_interleave_nxv16f64_nxv8f64:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: addi sp, sp, -16
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 4
+ ; CHECK-NEXT: sub sp, sp, a0
+ ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 3
+ ; CHECK-NEXT: add a0, sp, a0
+ ; CHECK-NEXT: addi a0, a0, 16
+ ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: srli a0, a0, 1
+ ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+ ; CHECK-NEXT: vid.v v24
+ ; CHECK-NEXT: vand.vi v26, v24, 1
+ ; CHECK-NEXT: vmsne.vi v0, v26, 0
+ ; CHECK-NEXT: vsrl.vi v2, v24, 1
+ ; CHECK-NEXT: csrr a1, vlenb
+ ; CHECK-NEXT: slli a1, a1, 3
+ ; CHECK-NEXT: add a1, sp, a1
+ ; CHECK-NEXT: addi a1, a1, 16
+ ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+ ; CHECK-NEXT: vadd.vx v2, v2, a0, v0.t
+ ; CHECK-NEXT: vmv4r.v v12, v16
+ ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+ ; CHECK-NEXT: vrgatherei16.vv v24, v8, v2, v0.t
+ ; CHECK-NEXT: addi a0, sp, 16
+ ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 3
+ ; CHECK-NEXT: add a0, sp, a0
+ ; CHECK-NEXT: addi a0, a0, 16
+ ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+ ; CHECK-NEXT: vmv4r.v v16, v12
+ ; CHECK-NEXT: vrgatherei16.vv v24, v16, v2, v0.t
+ ; CHECK-NEXT: addi a0, sp, 16
+ ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+ ; CHECK-NEXT: vmv.v.v v16, v24
+ ; CHECK-NEXT: csrr a0, vlenb
+ ; CHECK-NEXT: slli a0, a0, 4
+ ; CHECK-NEXT: add sp, sp, a0
+ ; CHECK-NEXT: addi sp, sp, 16
+ ; CHECK-NEXT: ret
+ %res = call <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
+ ret <vscale x 16 x double> %res
+ }
+
+ declare <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
+ declare <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
+ declare <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)