@@ -61,6 +61,16 @@ define i8 @uaddv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
+; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
+; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
+; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
+; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].b
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %op)
   ret i8 %res
@@ -127,6 +137,16 @@ define i16 @uaddv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].h, [[PG]]/m, [[LO]].h, [[HI]].h
+; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].h
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %op)
   ret i16 %res
@@ -193,6 +213,16 @@ define i32 @uaddv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].s, [[PG]]/m, [[LO]].s, [[HI]].s
+; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].s
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %op)
   ret i32 %res
@@ -259,6 +289,16 @@ define i64 @uaddv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].d, [[PG]]/m, [[LO]].d, [[HI]].d
+; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].d
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %op)
   ret i64 %res
@@ -329,6 +369,16 @@ define i8 @smaxv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_GE_512-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
+; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
+; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
+; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
+; VBITS_EQ_256-DAG: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %op)
   ret i8 %res
@@ -395,6 +445,16 @@ define i16 @smaxv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_GE_512-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].h, [[PG]]/m, [[LO]].h, [[HI]].h
+; VBITS_EQ_256-DAG: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %op)
   ret i16 %res
@@ -461,6 +521,16 @@ define i32 @smaxv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_GE_512-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
 ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].s, [[PG]]/m, [[LO]].s, [[HI]].s
+; VBITS_EQ_256-DAG: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
+; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %op)
   ret i32 %res
@@ -529,6 +599,16 @@ define i64 @smaxv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_GE_512-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].d, [[PG]]/m, [[LO]].d, [[HI]].d
+; VBITS_EQ_256-DAG: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %op)
   ret i64 %res
@@ -599,6 +679,16 @@ define i8 @sminv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_GE_512-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
+; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
+; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
+; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
+; VBITS_EQ_256-DAG: sminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %op)
   ret i8 %res
@@ -665,6 +755,16 @@ define i16 @sminv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_GE_512-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].h, [[PG]]/m, [[LO]].h, [[HI]].h
+; VBITS_EQ_256-DAG: sminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %op)
   ret i16 %res
@@ -731,6 +831,16 @@ define i32 @sminv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_GE_512-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
 ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].s, [[PG]]/m, [[LO]].s, [[HI]].s
+; VBITS_EQ_256-DAG: sminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
+; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %op)
   ret i32 %res
@@ -799,6 +909,16 @@ define i64 @sminv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_GE_512-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].d, [[PG]]/m, [[LO]].d, [[HI]].d
+; VBITS_EQ_256-DAG: sminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %op)
   ret i64 %res
@@ -869,6 +989,16 @@ define i8 @umaxv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_GE_512-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
+; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
+; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
+; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
+; VBITS_EQ_256-DAG: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %op)
   ret i8 %res
@@ -935,6 +1065,16 @@ define i16 @umaxv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_GE_512-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].h, [[PG]]/m, [[LO]].h, [[HI]].h
+; VBITS_EQ_256-DAG: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %op)
   ret i16 %res
@@ -1001,6 +1141,16 @@ define i32 @umaxv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_GE_512-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
 ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].s, [[PG]]/m, [[LO]].s, [[HI]].s
+; VBITS_EQ_256-DAG: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
+; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %op)
   ret i32 %res
@@ -1069,6 +1219,16 @@ define i64 @umaxv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_GE_512-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].d, [[PG]]/m, [[LO]].d, [[HI]].d
+; VBITS_EQ_256-DAG: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %op)
   ret i64 %res
@@ -1139,6 +1299,16 @@ define i8 @uminv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_GE_512-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
+; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
+; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
+; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
+; VBITS_EQ_256-DAG: uminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %op)
   ret i8 %res
@@ -1205,6 +1375,16 @@ define i16 @uminv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_GE_512-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].h, [[PG]]/m, [[LO]].h, [[HI]].h
+; VBITS_EQ_256-DAG: uminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
+; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %op)
   ret i16 %res
@@ -1271,6 +1451,16 @@ define i32 @uminv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_GE_512-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
 ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].s, [[PG]]/m, [[LO]].s, [[HI]].s
+; VBITS_EQ_256-DAG: uminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
+; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %op)
   ret i32 %res
@@ -1339,6 +1529,16 @@ define i64 @uminv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_GE_512-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
+; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
+; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].d, [[PG]]/m, [[LO]].d, [[HI]].d
+; VBITS_EQ_256-DAG: uminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
+; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
+; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %op)
   ret i64 %res
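
For context, the hunks above show only the changed FileCheck lines and the function bodies; the test boilerplate (RUN lines, intrinsic declarations, the `#0` attribute group) sits outside this diff. Below is a minimal, self-contained sketch of the first case as it would appear in the full .ll file. The RUN-line flags, check-prefix wiring, triple, and attribute spelling follow the usual shape of the sve-fixed-length-* tests and are assumptions here, not content of this commit.

; RUN lines of this shape select the check prefix matching the configured
; SVE register width (exact flags/prefix wiring assumed, not shown in diff):
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefix=VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefix=VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; At VBITS >= 512 the <64 x i8> operand fits in a single SVE register and is
; reduced directly; at VBITS == 256 type legalisation splits it into two
; 32-byte halves that are loaded, added element-wise, then reduced once.
define i8 @uaddv_v64i8(<64 x i8>* %a) #0 {
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %op)
  ret i8 %res
}

declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)

; Assumed attribute group enabling SVE code generation for the test functions.
attributes #0 = { "target-features"="+sve" }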