Skip to content

Commit 8fbc143

Browse files
committed
[AArch64] Merge [US]MULL with half adds and subs into [US]ML[AS]L

This patch adds patterns to teach the AArch64 backend to merge [US]MULL
instructions and adds/subs of half the size into [US]ML[AS]L where we don't
use the top half of the result.

Differential Revision: https://reviews.llvm.org/D95218

1 parent 6884fbc commit 8fbc143

File tree

2 files changed

+243
-0
lines changed

2 files changed

+243
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4792,6 +4792,44 @@ defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
47924792
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
47934793
BinOpFrag<(sub node:$LHS, (zanyext node:$RHS))>>;
47944794

4795+
// Additional patterns for [SU]ML[AS]L
4796+
multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperator vecopnode,
4797+
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
4798+
def : Pat<(v4i16 (opnode
4799+
V64:$Ra,
4800+
(v4i16 (extract_subvector
4801+
(vecopnode (v8i8 V64:$Rn),(v8i8 V64:$Rm)),
4802+
(i64 0))))),
4803+
(EXTRACT_SUBREG (v8i16 (INST8B
4804+
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Ra, dsub),
4805+
V64:$Rn, V64:$Rm)), dsub)>;
4806+
def : Pat<(v2i32 (opnode
4807+
V64:$Ra,
4808+
(v2i32 (extract_subvector
4809+
(vecopnode (v4i16 V64:$Rn),(v4i16 V64:$Rm)),
4810+
(i64 0))))),
4811+
(EXTRACT_SUBREG (v4i32 (INST4H
4812+
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Ra, dsub),
4813+
V64:$Rn, V64:$Rm)), dsub)>;
4814+
def : Pat<(v1i64 (opnode
4815+
V64:$Ra,
4816+
(v1i64 (extract_subvector
4817+
(vecopnode (v2i32 V64:$Rn),(v2i32 V64:$Rm)),
4818+
(i64 0))))),
4819+
(EXTRACT_SUBREG (v2i64 (INST2S
4820+
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Ra, dsub),
4821+
V64:$Rn, V64:$Rm)), dsub)>;
4822+
}
4823+
4824+
defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull,
4825+
UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
4826+
defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull,
4827+
SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
4828+
defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull,
4829+
UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
4830+
defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
4831+
SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
4832+
47954833
// Additional patterns for SMULL and UMULL
47964834
multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
47974835
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
4+
define <4 x i16> @test_mla0(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
5+
; CHECK-LABEL: test_mla0:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: umull v2.8h, v2.8b, v3.8b
8+
; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b
9+
; CHECK-NEXT: mov v0.16b, v2.16b
10+
; CHECK-NEXT: ret
11+
entry:
12+
%vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
13+
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %c, <8 x i8> %d)
14+
%add.i = add <8 x i16> %vmull.i.i, %vmull.i
15+
%shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
16+
ret <4 x i16> %shuffle.i
17+
}
18+
19+
20+
define <4 x i16> @test_mla1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
21+
; CHECK-LABEL: test_mla1:
22+
; CHECK: // %bb.0: // %entry
23+
; CHECK-NEXT: smull v2.8h, v2.8b, v3.8b
24+
; CHECK-NEXT: smlal v2.8h, v0.8b, v1.8b
25+
; CHECK-NEXT: mov v0.16b, v2.16b
26+
; CHECK-NEXT: ret
27+
entry:
28+
%vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
29+
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %c, <8 x i8> %d)
30+
%add.i = add <8 x i16> %vmull.i.i, %vmull.i
31+
%shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
32+
ret <4 x i16> %shuffle.i
33+
}
34+
35+
36+
define <2 x i32> @test_mla2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
37+
; CHECK-LABEL: test_mla2:
38+
; CHECK: // %bb.0: // %entry
39+
; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h
40+
; CHECK-NEXT: umlal v2.4s, v0.4h, v1.4h
41+
; CHECK-NEXT: mov v0.16b, v2.16b
42+
; CHECK-NEXT: ret
43+
entry:
44+
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
45+
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %c, <4 x i16> %d)
46+
%add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
47+
%shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
48+
ret <2 x i32> %shuffle.i
49+
}
50+
51+
52+
define <2 x i32> @test_mla3(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
53+
; CHECK-LABEL: test_mla3:
54+
; CHECK: // %bb.0: // %entry
55+
; CHECK-NEXT: smull v2.4s, v2.4h, v3.4h
56+
; CHECK-NEXT: smlal v2.4s, v0.4h, v1.4h
57+
; CHECK-NEXT: mov v0.16b, v2.16b
58+
; CHECK-NEXT: ret
59+
entry:
60+
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
61+
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %c, <4 x i16> %d)
62+
%add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
63+
%shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
64+
ret <2 x i32> %shuffle.i
65+
}
66+
67+
68+
define <1 x i64> @test_mla4(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
69+
; CHECK-LABEL: test_mla4:
70+
; CHECK: // %bb.0: // %entry
71+
; CHECK-NEXT: umull v2.2d, v2.2s, v3.2s
72+
; CHECK-NEXT: umlal v2.2d, v0.2s, v1.2s
73+
; CHECK-NEXT: mov v0.16b, v2.16b
74+
; CHECK-NEXT: ret
75+
entry:
76+
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
77+
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %c, <2 x i32> %d)
78+
%add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
79+
%shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> undef, <1 x i32> zeroinitializer
80+
ret <1 x i64> %shuffle.i
81+
}
82+
83+
84+
define <1 x i64> @test_mla5(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
85+
; CHECK-LABEL: test_mla5:
86+
; CHECK: // %bb.0: // %entry
87+
; CHECK-NEXT: smull v2.2d, v2.2s, v3.2s
88+
; CHECK-NEXT: smlal v2.2d, v0.2s, v1.2s
89+
; CHECK-NEXT: mov v0.16b, v2.16b
90+
; CHECK-NEXT: ret
91+
entry:
92+
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
93+
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %c, <2 x i32> %d)
94+
%add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
95+
%shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> undef, <1 x i32> zeroinitializer
96+
ret <1 x i64> %shuffle.i
97+
}
98+
99+
100+
define <4 x i16> @test_mls0(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
101+
; CHECK-LABEL: test_mls0:
102+
; CHECK: // %bb.0: // %entry
103+
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
104+
; CHECK-NEXT: umlsl v0.8h, v2.8b, v3.8b
105+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
106+
; CHECK-NEXT: ret
107+
entry:
108+
%vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
109+
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %c, <8 x i8> %d)
110+
%sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
111+
%shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
112+
ret <4 x i16> %shuffle.i
113+
}
114+
115+
116+
define <4 x i16> @test_mls1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
117+
; CHECK-LABEL: test_mls1:
118+
; CHECK: // %bb.0: // %entry
119+
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
120+
; CHECK-NEXT: smlsl v0.8h, v2.8b, v3.8b
121+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
122+
; CHECK-NEXT: ret
123+
entry:
124+
%vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
125+
%vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %c, <8 x i8> %d)
126+
%sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
127+
%shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
128+
ret <4 x i16> %shuffle.i
129+
}
130+
131+
132+
define <2 x i32> @test_mls2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
133+
; CHECK-LABEL: test_mls2:
134+
; CHECK: // %bb.0: // %entry
135+
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
136+
; CHECK-NEXT: umlsl v0.4s, v2.4h, v3.4h
137+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
138+
; CHECK-NEXT: ret
139+
entry:
140+
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
141+
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %c, <4 x i16> %d)
142+
%sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
143+
%shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
144+
ret <2 x i32> %shuffle.i
145+
}
146+
147+
148+
define <2 x i32> @test_mls3(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
149+
; CHECK-LABEL: test_mls3:
150+
; CHECK: // %bb.0: // %entry
151+
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
152+
; CHECK-NEXT: smlsl v0.4s, v2.4h, v3.4h
153+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
154+
; CHECK-NEXT: ret
155+
entry:
156+
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
157+
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %c, <4 x i16> %d)
158+
%sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
159+
%shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
160+
ret <2 x i32> %shuffle.i
161+
}
162+
163+
164+
define <1 x i64> @test_mls4(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
165+
; CHECK-LABEL: test_mls4:
166+
; CHECK: // %bb.0: // %entry
167+
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
168+
; CHECK-NEXT: umlsl v0.2d, v2.2s, v3.2s
169+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
170+
; CHECK-NEXT: ret
171+
entry:
172+
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
173+
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %c, <2 x i32> %d)
174+
%sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
175+
%shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> undef, <1 x i32> zeroinitializer
176+
ret <1 x i64> %shuffle.i
177+
}
178+
179+
180+
define <1 x i64> @test_mls5(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
181+
; CHECK-LABEL: test_mls5:
182+
; CHECK: // %bb.0: // %entry
183+
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
184+
; CHECK-NEXT: smlsl v0.2d, v2.2s, v3.2s
185+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
186+
; CHECK-NEXT: ret
187+
entry:
188+
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
189+
%vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %c, <2 x i32> %d)
190+
%sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
191+
%shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> undef, <1 x i32> zeroinitializer
192+
ret <1 x i64> %shuffle.i
193+
}
194+
195+
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
196+
197+
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
198+
199+
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
200+
201+
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
202+
203+
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
204+
205+
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)

0 commit comments

Comments
 (0)