@@ -48,9 +48,8 @@ define <4 x float> @fcvt_v4f16_v4f32(<4 x half> %op1) #0 {
48
48
define void @fcvt_v8f16_v8f32 (<8 x half >* %a , <8 x float >* %b ) #0 {
49
49
; CHECK-LABEL: fcvt_v8f16_v8f32:
50
50
; CHECK: // %bb.0:
51
- ; CHECK-NEXT: ldr q0, [x0]
52
51
; CHECK-NEXT: ptrue p0.s, vl8
53
- ; CHECK-NEXT: uunpklo z0.s, z0.h
52
+ ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
54
53
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
55
54
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
56
55
; CHECK-NEXT: ret
@@ -76,16 +75,15 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
76
75
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
77
76
; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
78
77
; VBITS_EQ_256-NEXT: ret
79
-
78
+ ;
80
79
; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
81
80
; VBITS_GE_512: // %bb.0:
82
- ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
83
- ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
84
81
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
85
- ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
82
+ ; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
86
83
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h
87
84
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
88
85
; VBITS_GE_512-NEXT: ret
86
+
89
87
%op1 = load <16 x half >, <16 x half >* %a
90
88
%res = fpext <16 x half > %op1 to <16 x float >
91
89
store <16 x float > %res , <16 x float >* %b
@@ -95,10 +93,8 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
95
93
define void @fcvt_v32f16_v32f32 (<32 x half >* %a , <32 x float >* %b ) #0 {
96
94
; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
97
95
; VBITS_GE_1024: // %bb.0:
98
- ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
99
- ; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
100
96
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
101
- ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
97
+ ; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0]
102
98
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h
103
99
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
104
100
; VBITS_GE_1024-NEXT: ret
@@ -111,10 +107,8 @@ define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
111
107
define void @fcvt_v64f16_v64f32 (<64 x half >* %a , <64 x float >* %b ) #0 {
112
108
; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
113
109
; VBITS_GE_2048: // %bb.0:
114
- ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
115
- ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
116
110
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
117
- ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
111
+ ; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
118
112
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h
119
113
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
120
114
; VBITS_GE_2048-NEXT: ret
@@ -156,10 +150,8 @@ define <2 x double> @fcvt_v2f16_v2f64(<2 x half> %op1) #0 {
156
150
define void @fcvt_v4f16_v4f64 (<4 x half >* %a , <4 x double >* %b ) #0 {
157
151
; CHECK-LABEL: fcvt_v4f16_v4f64:
158
152
; CHECK: // %bb.0:
159
- ; CHECK-NEXT: ldr d0, [x0]
160
153
; CHECK-NEXT: ptrue p0.d, vl4
161
- ; CHECK-NEXT: uunpklo z0.s, z0.h
162
- ; CHECK-NEXT: uunpklo z0.d, z0.s
154
+ ; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
163
155
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
164
156
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
165
157
; CHECK-NEXT: ret
@@ -170,7 +162,6 @@ define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
170
162
}
171
163
172
164
define void @fcvt_v8f16_v8f64 (<8 x half >* %a , <8 x double >* %b ) #0 {
173
- ; Ensure sensible type legalisation.
174
165
; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
175
166
; VBITS_EQ_256: // %bb.0:
176
167
; VBITS_EQ_256-NEXT: ldr q0, [x0]
@@ -186,16 +177,15 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
186
177
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h
187
178
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
188
179
; VBITS_EQ_256-NEXT: ret
189
-
180
+ ;
190
181
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
191
182
; VBITS_GE_512: // %bb.0:
192
- ; VBITS_GE_512-NEXT: ldr q0, [x0]
193
183
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
194
- ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
195
- ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
184
+ ; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x0]
196
185
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h
197
186
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
198
187
; VBITS_GE_512-NEXT: ret
188
+
199
189
%op1 = load <8 x half >, <8 x half >* %a
200
190
%res = fpext <8 x half > %op1 to <8 x double >
201
191
store <8 x double > %res , <8 x double >* %b
@@ -205,11 +195,8 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
205
195
define void @fcvt_v16f16_v16f64 (<16 x half >* %a , <16 x double >* %b ) #0 {
206
196
; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
207
197
; VBITS_GE_1024: // %bb.0:
208
- ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
209
- ; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
210
198
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
211
- ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
212
- ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
199
+ ; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0]
213
200
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h
214
201
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
215
202
; VBITS_GE_1024-NEXT: ret
@@ -222,11 +209,8 @@ define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
222
209
define void @fcvt_v32f16_v32f64 (<32 x half >* %a , <32 x double >* %b ) #0 {
223
210
; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
224
211
; VBITS_GE_2048: // %bb.0:
225
- ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
226
- ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
227
212
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
228
- ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
229
- ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
213
+ ; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
230
214
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h
231
215
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
232
216
; VBITS_GE_2048-NEXT: ret
@@ -264,9 +248,8 @@ define <2 x double> @fcvt_v2f32_v2f64(<2 x float> %op1) #0 {
264
248
define void @fcvt_v4f32_v4f64 (<4 x float >* %a , <4 x double >* %b ) #0 {
265
249
; CHECK-LABEL: fcvt_v4f32_v4f64:
266
250
; CHECK: // %bb.0:
267
- ; CHECK-NEXT: ldr q0, [x0]
268
251
; CHECK-NEXT: ptrue p0.d, vl4
269
- ; CHECK-NEXT: uunpklo z0.d, z0.s
252
+ ; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
270
253
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
271
254
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
272
255
; CHECK-NEXT: ret
@@ -292,16 +275,15 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
292
275
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
293
276
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
294
277
; VBITS_EQ_256-NEXT: ret
295
-
278
+ ;
296
279
; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
297
280
; VBITS_GE_512: // %bb.0:
298
- ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
299
- ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
300
281
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
301
- ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
282
+ ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0]
302
283
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.s
303
284
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
304
285
; VBITS_GE_512-NEXT: ret
286
+
305
287
%op1 = load <8 x float >, <8 x float >* %a
306
288
%res = fpext <8 x float > %op1 to <8 x double >
307
289
store <8 x double > %res , <8 x double >* %b
@@ -311,10 +293,8 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
311
293
define void @fcvt_v16f32_v16f64 (<16 x float >* %a , <16 x double >* %b ) #0 {
312
294
; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
313
295
; VBITS_GE_1024: // %bb.0:
314
- ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
315
- ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
316
296
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
317
- ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
297
+ ; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0]
318
298
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s
319
299
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
320
300
; VBITS_GE_1024-NEXT: ret
@@ -327,10 +307,8 @@ define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
327
307
define void @fcvt_v32f32_v32f64 (<32 x float >* %a , <32 x double >* %b ) #0 {
328
308
; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
329
309
; VBITS_GE_2048: // %bb.0:
330
- ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
331
- ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
332
310
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
333
- ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
311
+ ; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
334
312
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s
335
313
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
336
314
; VBITS_GE_2048-NEXT: ret
@@ -403,11 +381,8 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
403
381
; VBITS_GE_512: // %bb.0:
404
382
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
405
383
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
406
- ; VBITS_GE_512-NEXT: ptrue p0.s
407
384
; VBITS_GE_512-NEXT: fcvt z0.h, p0/m, z0.s
408
- ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
409
- ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
410
- ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
385
+ ; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
411
386
; VBITS_GE_512-NEXT: ret
412
387
%op1 = load <16 x float >, <16 x float >* %a
413
388
%res = fptrunc <16 x float > %op1 to <16 x half >
@@ -420,11 +395,8 @@ define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
420
395
; VBITS_GE_1024: // %bb.0:
421
396
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
422
397
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
423
- ; VBITS_GE_1024-NEXT: ptrue p0.s
424
398
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
425
- ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
426
- ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
427
- ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
399
+ ; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x1]
428
400
; VBITS_GE_1024-NEXT: ret
429
401
%op1 = load <32 x float >, <32 x float >* %a
430
402
%res = fptrunc <32 x float > %op1 to <32 x half >
@@ -437,11 +409,8 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
437
409
; VBITS_GE_2048: // %bb.0:
438
410
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
439
411
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
440
- ; VBITS_GE_2048-NEXT: ptrue p0.s
441
412
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
442
- ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
443
- ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
444
- ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
413
+ ; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x1]
445
414
; VBITS_GE_2048-NEXT: ret
446
415
%op1 = load <64 x float >, <64 x float >* %a
447
416
%res = fptrunc <64 x float > %op1 to <64 x half >
@@ -533,12 +502,8 @@ define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
533
502
; VBITS_GE_1024: // %bb.0:
534
503
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
535
504
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
536
- ; VBITS_GE_1024-NEXT: ptrue p0.d
537
505
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
538
- ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
539
- ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
540
- ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
541
- ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
506
+ ; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [x1]
542
507
; VBITS_GE_1024-NEXT: ret
543
508
%op1 = load <16 x double >, <16 x double >* %a
544
509
%res = fptrunc <16 x double > %op1 to <16 x half >
@@ -551,12 +516,8 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
551
516
; VBITS_GE_2048: // %bb.0:
552
517
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
553
518
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
554
- ; VBITS_GE_2048-NEXT: ptrue p0.d
555
519
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
556
- ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
557
- ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
558
- ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
559
- ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
520
+ ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x1]
560
521
; VBITS_GE_2048-NEXT: ret
561
522
%op1 = load <32 x double >, <32 x double >* %a
562
523
%res = fptrunc <32 x double > %op1 to <32 x half >
@@ -627,11 +588,8 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
627
588
; VBITS_GE_512: // %bb.0:
628
589
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
629
590
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
630
- ; VBITS_GE_512-NEXT: ptrue p0.d
631
591
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.d
632
- ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
633
- ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
634
- ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
592
+ ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
635
593
; VBITS_GE_512-NEXT: ret
636
594
%op1 = load <8 x double >, <8 x double >* %a
637
595
%res = fptrunc <8 x double > %op1 to <8 x float >
@@ -644,11 +602,8 @@ define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
644
602
; VBITS_GE_1024: // %bb.0:
645
603
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
646
604
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
647
- ; VBITS_GE_1024-NEXT: ptrue p0.d
648
605
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
649
- ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
650
- ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
651
- ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
606
+ ; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x1]
652
607
; VBITS_GE_1024-NEXT: ret
653
608
%op1 = load <16 x double >, <16 x double >* %a
654
609
%res = fptrunc <16 x double > %op1 to <16 x float >
@@ -661,11 +616,8 @@ define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
661
616
; VBITS_GE_2048: // %bb.0:
662
617
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
663
618
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
664
- ; VBITS_GE_2048-NEXT: ptrue p0.d
665
619
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
666
- ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
667
- ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
668
- ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
620
+ ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x1]
669
621
; VBITS_GE_2048-NEXT: ret
670
622
%op1 = load <32 x double >, <32 x double >* %a
671
623
%res = fptrunc <32 x double > %op1 to <32 x float >
0 commit comments