Skip to content

Commit 747c574

Browse files
committed
[ARM] Extra MVE VMLAV reduction patterns
These patterns for i8 and i16 VMLAs were missing. They arise from the legalized vector.reduce.add.v8i16 and vector.reduce.add.v16i8 and, although the instruction works differently (the mul and add are performed in a higher precision), I believe it is OK because only an i8/i16 is demanded from them, so the results will be the same. At least, they pass any testing I can think to run on them. Some tests end up looking worse, but they are quite artificial because they pass half-width vector types across a call boundary. I would not expect the vmull to realistically come up like that, and a vmlava is likely to be better much of the time. Differential Revision: https://reviews.llvm.org/D80524
1 parent 34cfed2 commit 747c574

File tree

2 files changed

+30
-20
lines changed

2 files changed

+30
-20
lines changed

llvm/lib/Target/ARM/ARMInstrMVE.td

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1019,22 +1019,32 @@ def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>;
10191019
let Predicates = [HasMVEInt] in {
10201020
def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
10211021
(i32 (MVE_VMLADAVu32 $src1, $src2))>;
1022-
def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
1023-
(i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
1024-
def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
1025-
(i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
1022+
def : Pat<(i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
1023+
(i32 (MVE_VMLADAVu16 $src1, $src2))>;
10261024
def : Pat<(i32 (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
10271025
(i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
10281026
def : Pat<(i32 (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
10291027
(i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
1028+
def : Pat<(i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
1029+
(i32 (MVE_VMLADAVu8 $src1, $src2))>;
1030+
def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
1031+
(i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
1032+
def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
1033+
(i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
10301034

10311035
def : Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
10321036
(i32 tGPREven:$src3))),
10331037
(i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>;
1038+
def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
1039+
(i32 tGPREven:$src3))),
1040+
(i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>;
10341041
def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
10351042
(i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
10361043
def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
10371044
(i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
1045+
def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
1046+
(i32 tGPREven:$src3))),
1047+
(i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>;
10381048
def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
10391049
(i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
10401050
def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),

llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -135,8 +135,7 @@ entry:
135135
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
136136
; CHECK-LABEL: add_v8i16_v8i16:
137137
; CHECK: @ %bb.0: @ %entry
138-
; CHECK-NEXT: vmul.i16 q0, q0, q1
139-
; CHECK-NEXT: vaddv.u16 r0, q0
138+
; CHECK-NEXT: vmlav.u16 r0, q0, q1
140139
; CHECK-NEXT: uxth r0, r0
141140
; CHECK-NEXT: bx lr
142141
entry:
@@ -438,8 +437,9 @@ entry:
438437
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
439438
; CHECK-LABEL: add_v8i8_v8i16_zext:
440439
; CHECK: @ %bb.0: @ %entry
441-
; CHECK-NEXT: vmullb.u8 q0, q0, q1
442-
; CHECK-NEXT: vaddv.u16 r0, q0
440+
; CHECK-NEXT: vmovlb.u8 q1, q1
441+
; CHECK-NEXT: vmovlb.u8 q0, q0
442+
; CHECK-NEXT: vmlav.u16 r0, q0, q1
443443
; CHECK-NEXT: uxth r0, r0
444444
; CHECK-NEXT: bx lr
445445
entry:
@@ -453,8 +453,9 @@ entry:
453453
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
454454
; CHECK-LABEL: add_v8i8_v8i16_sext:
455455
; CHECK: @ %bb.0: @ %entry
456-
; CHECK-NEXT: vmullb.s8 q0, q0, q1
457-
; CHECK-NEXT: vaddv.u16 r0, q0
456+
; CHECK-NEXT: vmovlb.s8 q1, q1
457+
; CHECK-NEXT: vmovlb.s8 q0, q0
458+
; CHECK-NEXT: vmlav.u16 r0, q0, q1
458459
; CHECK-NEXT: sxth r0, r0
459460
; CHECK-NEXT: bx lr
460461
entry:
@@ -468,8 +469,7 @@ entry:
468469
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
469470
; CHECK-LABEL: add_v16i8_v16i8:
470471
; CHECK: @ %bb.0: @ %entry
471-
; CHECK-NEXT: vmul.i8 q0, q0, q1
472-
; CHECK-NEXT: vaddv.u8 r0, q0
472+
; CHECK-NEXT: vmlav.u8 r0, q0, q1
473473
; CHECK-NEXT: uxtb r0, r0
474474
; CHECK-NEXT: bx lr
475475
entry:
@@ -1086,8 +1086,7 @@ entry:
10861086
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
10871087
; CHECK-LABEL: add_v8i16_v8i16_acc:
10881088
; CHECK: @ %bb.0: @ %entry
1089-
; CHECK-NEXT: vmul.i16 q0, q0, q1
1090-
; CHECK-NEXT: vaddva.u16 r0, q0
1089+
; CHECK-NEXT: vmlava.u16 r0, q0, q1
10911090
; CHECK-NEXT: uxth r0, r0
10921091
; CHECK-NEXT: bx lr
10931092
entry:
@@ -1408,8 +1407,9 @@ entry:
14081407
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
14091408
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
14101409
; CHECK: @ %bb.0: @ %entry
1411-
; CHECK-NEXT: vmullb.u8 q0, q0, q1
1412-
; CHECK-NEXT: vaddva.u16 r0, q0
1410+
; CHECK-NEXT: vmovlb.u8 q1, q1
1411+
; CHECK-NEXT: vmovlb.u8 q0, q0
1412+
; CHECK-NEXT: vmlava.u16 r0, q0, q1
14131413
; CHECK-NEXT: uxth r0, r0
14141414
; CHECK-NEXT: bx lr
14151415
entry:
@@ -1424,8 +1424,9 @@ entry:
14241424
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
14251425
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
14261426
; CHECK: @ %bb.0: @ %entry
1427-
; CHECK-NEXT: vmullb.s8 q0, q0, q1
1428-
; CHECK-NEXT: vaddva.u16 r0, q0
1427+
; CHECK-NEXT: vmovlb.s8 q1, q1
1428+
; CHECK-NEXT: vmovlb.s8 q0, q0
1429+
; CHECK-NEXT: vmlava.u16 r0, q0, q1
14291430
; CHECK-NEXT: sxth r0, r0
14301431
; CHECK-NEXT: bx lr
14311432
entry:
@@ -1440,8 +1441,7 @@ entry:
14401441
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
14411442
; CHECK-LABEL: add_v16i8_v16i8_acc:
14421443
; CHECK: @ %bb.0: @ %entry
1443-
; CHECK-NEXT: vmul.i8 q0, q0, q1
1444-
; CHECK-NEXT: vaddva.u8 r0, q0
1444+
; CHECK-NEXT: vmlava.u8 r0, q0, q1
14451445
; CHECK-NEXT: uxtb r0, r0
14461446
; CHECK-NEXT: bx lr
14471447
entry:

0 commit comments

Comments
 (0)