Skip to content

Commit 17fe9bd

Browse files
[DAGCombiner] Add generic DAG combine for ISD::PARTIAL_REDUCE_MLA
Add generic DAG combine for ISD::PARTIAL_REDUCE_U/SMLA nodes. Transforms the DAG from: PARTIAL_REDUCE_MLA(Acc, MUL(EXT(MulOpLHS), EXT(MulOpRHS)), Splat(1)) to PARTIAL_REDUCE_MLA(Acc, MulOpLHS, MulOpRHS).
1 parent 638c0ca commit 17fe9bd

File tree

3 files changed

+138
-85
lines changed

3 files changed

+138
-85
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,7 @@ namespace {
545545
SDValue visitMGATHER(SDNode *N);
546546
SDValue visitMSCATTER(SDNode *N);
547547
SDValue visitMHISTOGRAM(SDNode *N);
548+
SDValue visitPARTIAL_REDUCE_MLA(SDNode *N);
548549
SDValue visitVPGATHER(SDNode *N);
549550
SDValue visitVPSCATTER(SDNode *N);
550551
SDValue visitVP_STRIDED_LOAD(SDNode *N);
@@ -1972,6 +1973,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
19721973
case ISD::MSCATTER: return visitMSCATTER(N);
19731974
case ISD::MSTORE: return visitMSTORE(N);
19741975
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
1976+
case ISD::PARTIAL_REDUCE_SMLA:
1977+
case ISD::PARTIAL_REDUCE_UMLA:
1978+
return visitPARTIAL_REDUCE_MLA(N);
19751979
case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
19761980
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
19771981
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
@@ -12497,6 +12501,50 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
1249712501
return SDValue();
1249812502
}
1249912503

12504+
/// Combine for ISD::PARTIAL_REDUCE_SMLA / ISD::PARTIAL_REDUCE_UMLA nodes.
///
/// Folds
///   PARTIAL_REDUCE_*MLA(Acc, MUL(EXT(MulOpLHS), EXT(MulOpRHS)), Splat(1))
/// into
///   PARTIAL_REDUCE_*MLA(Acc, MulOpLHS, MulOpRHS)
/// so the extends and the explicit multiply are absorbed into the partial
/// reduction node itself.
///
/// \param N the PARTIAL_REDUCE_SMLA / PARTIAL_REDUCE_UMLA node being visited.
/// \returns the replacement node, or an empty SDValue when the pattern does
///          not match and N should be left alone.
SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
  SDLoc DL(N);
  SDValue Acc = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Op2 = N->getOperand(2);

  // The first multiplicand of the reduction must itself be a multiply.
  if (Op1->getOpcode() != ISD::MUL)
    return SDValue();

  // Both operands of that multiply must be extend nodes.
  SDValue ExtMulOpLHS = Op1->getOperand(0);
  SDValue ExtMulOpRHS = Op1->getOperand(1);
  unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
  unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
  if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
      !ISD::isExtOpcode(ExtMulOpRHSOpcode))
    return SDValue();

  // The pre-extension values become the new multiplicands, so they must agree
  // on type.
  SDValue MulOpLHS = ExtMulOpLHS->getOperand(0);
  SDValue MulOpRHS = ExtMulOpRHS->getOperand(0);
  EVT MulOpLHSVT = MulOpLHS.getValueType();
  if (MulOpLHSVT != MulOpRHS.getValueType())
    return SDValue();

  // Only form the new node when both the narrow input type and the result
  // type are legal for the target.
  if (!TLI.isTypeLegal(MulOpLHSVT) || !TLI.isTypeLegal(N->getValueType(0)))
    return SDValue();

  // The second multiplicand must be a constant splat of 1, i.e. the existing
  // node only accumulates the MUL result.
  APInt ConstantOne;
  if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) ||
      !ConstantOne.isOne())
    return SDValue();

  // Mixed signedness cannot be expressed by a single SMLA/UMLA node. Any
  // extend opcode other than SIGN_EXTEND is treated as unsigned here.
  bool LHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
  bool RHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
  if (LHSIsSigned != RHSIsSigned)
    return SDValue();

  unsigned NewOpcode =
      LHSIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
  // Use the result type of the node being replaced. Going through
  // Acc->getValueType(0) would read result #0 of the accumulator's defining
  // node, regardless of which of its results Acc actually refers to.
  return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, MulOpLHS,
                     MulOpRHS);
}
12547+
1250012548
SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
1250112549
auto *SLD = cast<VPStridedLoadSDNode>(N);
1250212550
EVT EltVT = SLD->getValueType(0).getVectorElementType();

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
1212
;
1313
; CHECK-NODOT-LABEL: udot:
1414
; CHECK-NODOT: // %bb.0:
15-
; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16-
; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17-
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18-
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19-
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20-
; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21-
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
15+
; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16+
; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17+
; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18+
; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19+
; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20+
; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21+
; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22+
; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23+
; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
2224
; CHECK-NODOT-NEXT: ret
2325
%u.wide = zext <16 x i8> %u to <16 x i32>
2426
%s.wide = zext <16 x i8> %s to <16 x i32>
@@ -35,17 +37,19 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
3537
;
3638
; CHECK-NODOT-LABEL: udot_narrow:
3739
; CHECK-NODOT: // %bb.0:
38-
; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
40+
; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
41+
; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
3942
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
40-
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
41-
; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
42-
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
43-
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
43+
; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
44+
; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
45+
; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
46+
; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
47+
; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
4448
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
45-
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
46-
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
47-
; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
49+
; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
50+
; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
4851
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
52+
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
4953
; CHECK-NODOT-NEXT: ret
5054
%u.wide = zext <8 x i8> %u to <8 x i32>
5155
%s.wide = zext <8 x i8> %s to <8 x i32>
@@ -62,13 +66,15 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
6266
;
6367
; CHECK-NODOT-LABEL: sdot:
6468
; CHECK-NODOT: // %bb.0:
65-
; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
66-
; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
67-
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
68-
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
69-
; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
70-
; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
71-
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
69+
; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
70+
; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
71+
; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
72+
; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
73+
; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
74+
; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
75+
; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
76+
; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
77+
; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
7278
; CHECK-NODOT-NEXT: ret
7379
%u.wide = sext <16 x i8> %u to <16 x i32>
7480
%s.wide = sext <16 x i8> %s to <16 x i32>
@@ -85,17 +91,19 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
8591
;
8692
; CHECK-NODOT-LABEL: sdot_narrow:
8793
; CHECK-NODOT: // %bb.0:
88-
; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
94+
; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
95+
; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
8996
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
90-
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
91-
; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
92-
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
93-
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
97+
; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
98+
; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
99+
; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
100+
; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
101+
; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
94102
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
95-
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
96-
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
97-
; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
103+
; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
104+
; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
98105
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
106+
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
99107
; CHECK-NODOT-NEXT: ret
100108
%u.wide = sext <8 x i8> %u to <8 x i32>
101109
%s.wide = sext <8 x i8> %s to <8 x i32>
@@ -531,9 +539,10 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
531539
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
532540
; CHECK-LABEL: not_udot:
533541
; CHECK: // %bb.0:
534-
; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
535-
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
536-
; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
542+
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
543+
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
544+
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
545+
; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
537546
; CHECK-NEXT: ret
538547
%u.wide = zext <8 x i8> %u to <8 x i32>
539548
%s.wide = zext <8 x i8> %s to <8 x i32>

llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Lines changed: 48 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,23 @@ define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
1111
;
1212
; CHECK-NEWLOWERING-LABEL: udot:
1313
; CHECK-NEWLOWERING: // %bb.0: // %entry
14-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
15-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
16-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
14+
; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z2.b
15+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z1.b
1716
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
17+
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
1818
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
1919
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
20-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
2120
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
21+
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
2222
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
23-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
24-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
25-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
26-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
27-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
28-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
29-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
30-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
31-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
23+
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
24+
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h
25+
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.s, z1.h
26+
; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s
27+
; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
28+
; CHECK-NEWLOWERING-NEXT: uunpklo z1.s, z1.h
29+
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
30+
; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s
3231
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
3332
; CHECK-NEWLOWERING-NEXT: ret
3433
entry:
@@ -47,24 +46,23 @@ define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
4746
;
4847
; CHECK-NEWLOWERING-LABEL: udot_wide:
4948
; CHECK-NEWLOWERING: // %bb.0: // %entry
50-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
51-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
52-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
49+
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
50+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
5351
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
52+
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
5453
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
5554
; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
56-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
5755
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
56+
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
5857
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
59-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s
60-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
61-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s
62-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
63-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
64-
; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
65-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
66-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
67-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
58+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
59+
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z2.s
60+
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z1.s
61+
; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
62+
; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
63+
; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s
64+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
65+
; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
6866
; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
6967
; CHECK-NEWLOWERING-NEXT: ret
7068
entry:
@@ -83,24 +81,23 @@ define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a,
8381
;
8482
; CHECK-NEWLOWERING-LABEL: sdot:
8583
; CHECK-NEWLOWERING: // %bb.0: // %entry
86-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
87-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
88-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
84+
; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z2.b
85+
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z1.b
8986
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
87+
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
9088
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
9189
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
92-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
9390
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
91+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
9492
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
95-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
96-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
97-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
98-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
99-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
100-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
101-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
102-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
103-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
93+
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
94+
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h
95+
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.s, z1.h
96+
; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s
97+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
98+
; CHECK-NEWLOWERING-NEXT: sunpklo z1.s, z1.h
99+
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
100+
; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s
104101
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
105102
; CHECK-NEWLOWERING-NEXT: ret
106103
entry:
@@ -119,24 +116,23 @@ define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
119116
;
120117
; CHECK-NEWLOWERING-LABEL: sdot_wide:
121118
; CHECK-NEWLOWERING: // %bb.0: // %entry
122-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h
123-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h
124-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
119+
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h
120+
; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z1.h
125121
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
122+
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
126123
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
127124
; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
128-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
129125
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
126+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
130127
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
131-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s
132-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
133-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s
134-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
135-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
136-
; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
137-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
138-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
139-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
128+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
129+
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z2.s
130+
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z1.s
131+
; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
132+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
133+
; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s
134+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
135+
; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
140136
; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
141137
; CHECK-NEWLOWERING-NEXT: ret
142138
entry:

0 commit comments

Comments
 (0)