Skip to content

Commit c33fd3b

Browse files
committed
[AArch64] Lower all fp zero buildvectors through BUILD_VECTOR.
Just like with integers, we can treat zero fp buildvectors as legal so that they can be recognized in tablegen patterns using immAllZerosV.
1 parent 314e431 commit c33fd3b

File tree

3 files changed

+189
-22
lines changed

3 files changed

+189
-22
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12230,20 +12230,22 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
1223012230
if (Op.getOpcode() != ISD::BUILD_VECTOR)
1223112231
return SDValue();
1223212232

12233-
if (VT.isInteger()) {
12234-
// Certain vector constants, used to express things like logical NOT and
12235-
// arithmetic NEG, are passed through unmodified. This allows special
12236-
// patterns for these operations to match, which will lower these constants
12237-
// to whatever is proven necessary.
12238-
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
12239-
if (BVN->isConstant())
12240-
if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
12241-
unsigned BitSize = VT.getVectorElementType().getSizeInBits();
12242-
APInt Val(BitSize,
12243-
Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
12244-
if (Val.isZero() || Val.isAllOnes())
12245-
return Op;
12246-
}
12233+
// Certain vector constants, used to express things like logical NOT and
12234+
// arithmetic NEG, are passed through unmodified. This allows special
12235+
// patterns for these operations to match, which will lower these constants
12236+
// to whatever is proven necessary.
12237+
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
12238+
if (BVN->isConstant()) {
12239+
if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
12240+
unsigned BitSize = VT.getVectorElementType().getSizeInBits();
12241+
APInt Val(BitSize,
12242+
Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
12243+
if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
12244+
return Op;
12245+
}
12246+
if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
12247+
if (Const->isZero() && !Const->isNegative())
12248+
return Op;
1224712249
}
1224812250

1224912251
if (SDValue V = ConstantBuildVector(Op, DAG))
@@ -12445,7 +12447,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
1244512447
APInt ConstantValueAPInt(1, 0);
1244612448
if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
1244712449
ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
12448-
if (!isNullConstant(ConstantValue) && !ConstantValueAPInt.isAllOnes()) {
12450+
if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
12451+
!ConstantValueAPInt.isAllOnes()) {
1244912452
Val = ConstantBuildVector(Val, DAG);
1245012453
if (!Val)
1245112454
// Otherwise, materialize the constant and splat it.

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6447,6 +6447,10 @@ def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
64476447
def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
64486448
def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
64496449
def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
6450+
def : Pat<(v2f64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
6451+
def : Pat<(v4f32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
6452+
def : Pat<(v8f16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
6453+
def : Pat<(v8bf16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
64506454

64516455
def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
64526456
def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
@@ -6459,6 +6463,10 @@ def : Pat<(v1i64 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
64596463
def : Pat<(v2i32 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
64606464
def : Pat<(v4i16 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
64616465
def : Pat<(v8i8 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
6466+
def : Pat<(v1f64 immAllZerosV), (MOVID (i32 0))>;
6467+
def : Pat<(v2f32 immAllZerosV), (MOVID (i32 0))>;
6468+
def : Pat<(v4f16 immAllZerosV), (MOVID (i32 0))>;
6469+
def : Pat<(v4bf16 immAllZerosV), (MOVID (i32 0))>;
64626470

64636471
def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
64646472
def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;

llvm/test/CodeGen/AArch64/arm64-build-vector.ll

Lines changed: 163 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s
2+
; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+fullfp16,+bf16 | FileCheck %s
33

44
; Check that building a vector from floats doesn't insert an unnecessary
55
; copy for lane zero.
@@ -10,9 +10,9 @@ define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
1010
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
1111
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
1212
; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3
13-
; CHECK-NEXT: mov.s v0[1], v1[0]
14-
; CHECK-NEXT: mov.s v0[2], v2[0]
15-
; CHECK-NEXT: mov.s v0[3], v3[0]
13+
; CHECK-NEXT: mov v0.s[1], v1.s[0]
14+
; CHECK-NEXT: mov v0.s[2], v2.s[0]
15+
; CHECK-NEXT: mov v0.s[3], v3.s[0]
1616
; CHECK-NEXT: ret
1717
%1 = insertelement <4 x float> undef, float %a, i32 0
1818
%2 = insertelement <4 x float> %1, float %b, i32 1
@@ -26,7 +26,7 @@ define <8 x i16> @build_all_zero(<8 x i16> %a) #1 {
2626
; CHECK: // %bb.0:
2727
; CHECK-NEXT: mov w8, #44672
2828
; CHECK-NEXT: fmov s1, w8
29-
; CHECK-NEXT: mul.8h v0, v0, v1
29+
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
3030
; CHECK-NEXT: ret
3131
%b = add <8 x i16> %a, <i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
3232
%c = mul <8 x i16> %b, <i16 -20864, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
@@ -41,7 +41,7 @@ define <8 x i16> @build_all_zero(<8 x i16> %a) #1 {
4141
define <8 x i16> @concat_2_build_vector(<4 x i16> %in0) {
4242
; CHECK-LABEL: concat_2_build_vector:
4343
; CHECK: // %bb.0:
44-
; CHECK-NEXT: movi.2d v0, #0000000000000000
44+
; CHECK-NEXT: movi v0.2d, #0000000000000000
4545
; CHECK-NEXT: ret
4646
%vshl_n = shl <4 x i16> %in0, <i16 8, i16 8, i16 8, i16 8>
4747
%vshl_n2 = shl <4 x i16> %vshl_n, <i16 9, i16 9, i16 9, i16 9>
@@ -98,9 +98,165 @@ define <1 x double> @convert_single_fp_vector_constant(i1 %cmp) {
9898
; CHECK-NEXT: csetm x9, ne
9999
; CHECK-NEXT: fmov d0, x8
100100
; CHECK-NEXT: fmov d1, x9
101-
; CHECK-NEXT: and.8b v0, v0, v1
101+
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
102102
; CHECK-NEXT: ret
103103
entry:
104104
%sel = select i1 %cmp, <1 x double> <double 1.000000e+00>, <1 x double> zeroinitializer
105105
ret <1 x double> %sel
106106
}
107+
108+
; All Zero and All -Zero tests.
109+
110+
define <2 x double> @poszero_v2f64(<2 x double> %a) {
111+
; CHECK-LABEL: poszero_v2f64:
112+
; CHECK: // %bb.0:
113+
; CHECK-NEXT: movi v1.2d, #0000000000000000
114+
; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
115+
; CHECK-NEXT: ret
116+
%b = fadd <2 x double> %a, <double 0.0, double 0.0>
117+
ret <2 x double> %b
118+
}
119+
120+
define <2 x double> @negzero_v2f64(<2 x double> %a) {
121+
; CHECK-LABEL: negzero_v2f64:
122+
; CHECK: // %bb.0:
123+
; CHECK-NEXT: mov x8, #-9223372036854775808
124+
; CHECK-NEXT: dup v1.2d, x8
125+
; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
126+
; CHECK-NEXT: ret
127+
%b = fmul <2 x double> %a, <double -0.0, double -0.0>
128+
ret <2 x double> %b
129+
}
130+
131+
define <1 x double> @poszero_v1f64(<1 x double> %a) {
132+
; CHECK-LABEL: poszero_v1f64:
133+
; CHECK: // %bb.0:
134+
; CHECK-NEXT: movi d1, #0000000000000000
135+
; CHECK-NEXT: fadd d0, d0, d1
136+
; CHECK-NEXT: ret
137+
%b = fadd <1 x double> %a, <double 0.0>
138+
ret <1 x double> %b
139+
}
140+
141+
define <1 x double> @negzero_v1f64(<1 x double> %a) {
142+
; CHECK-LABEL: negzero_v1f64:
143+
; CHECK: // %bb.0:
144+
; CHECK-NEXT: mov x8, #-9223372036854775808
145+
; CHECK-NEXT: fmov d1, x8
146+
; CHECK-NEXT: fmul d0, d0, d1
147+
; CHECK-NEXT: ret
148+
%b = fmul <1 x double> %a, <double -0.0>
149+
ret <1 x double> %b
150+
}
151+
152+
define <4 x float> @poszero_v4f32(<4 x float> %a) {
153+
; CHECK-LABEL: poszero_v4f32:
154+
; CHECK: // %bb.0:
155+
; CHECK-NEXT: movi v1.2d, #0000000000000000
156+
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
157+
; CHECK-NEXT: ret
158+
%b = fadd <4 x float> %a, <float 0.0, float 0.0, float 0.0, float 0.0>
159+
ret <4 x float> %b
160+
}
161+
162+
define <4 x float> @negzero_v4f32(<4 x float> %a) {
163+
; CHECK-LABEL: negzero_v4f32:
164+
; CHECK: // %bb.0:
165+
; CHECK-NEXT: movi v1.4s, #128, lsl #24
166+
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
167+
; CHECK-NEXT: ret
168+
%b = fmul <4 x float> %a, <float -0.0, float -0.0, float -0.0, float -0.0>
169+
ret <4 x float> %b
170+
}
171+
172+
define <2 x float> @poszero_v2f32(<2 x float> %a) {
173+
; CHECK-LABEL: poszero_v2f32:
174+
; CHECK: // %bb.0:
175+
; CHECK-NEXT: movi d1, #0000000000000000
176+
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
177+
; CHECK-NEXT: ret
178+
%b = fadd <2 x float> %a, <float 0.0, float 0.0>
179+
ret <2 x float> %b
180+
}
181+
182+
define <2 x float> @negzero_v2f32(<2 x float> %a) {
183+
; CHECK-LABEL: negzero_v2f32:
184+
; CHECK: // %bb.0:
185+
; CHECK-NEXT: movi v1.2s, #128, lsl #24
186+
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
187+
; CHECK-NEXT: ret
188+
%b = fmul <2 x float> %a, <float -0.0, float -0.0>
189+
ret <2 x float> %b
190+
}
191+
192+
define <8 x half> @poszero_v8f16(<8 x half> %a) {
193+
; CHECK-LABEL: poszero_v8f16:
194+
; CHECK: // %bb.0:
195+
; CHECK-NEXT: movi v1.2d, #0000000000000000
196+
; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
197+
; CHECK-NEXT: ret
198+
%b = fadd <8 x half> %a, <half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0>
199+
ret <8 x half> %b
200+
}
201+
202+
define <8 x half> @negzero_v8f16(<8 x half> %a) {
203+
; CHECK-LABEL: negzero_v8f16:
204+
; CHECK: // %bb.0:
205+
; CHECK-NEXT: movi v1.8h, #128, lsl #8
206+
; CHECK-NEXT: fmul v0.8h, v0.8h, v1.8h
207+
; CHECK-NEXT: ret
208+
%b = fmul <8 x half> %a, <half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0>
209+
ret <8 x half> %b
210+
}
211+
212+
define <4 x half> @poszero_v4f16(<4 x half> %a) {
213+
; CHECK-LABEL: poszero_v4f16:
214+
; CHECK: // %bb.0:
215+
; CHECK-NEXT: movi d1, #0000000000000000
216+
; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
217+
; CHECK-NEXT: ret
218+
%b = fadd <4 x half> %a, <half 0.0, half 0.0, half 0.0, half 0.0>
219+
ret <4 x half> %b
220+
}
221+
222+
define <4 x half> @negzero_v4f16(<4 x half> %a) {
223+
; CHECK-LABEL: negzero_v4f16:
224+
; CHECK: // %bb.0:
225+
; CHECK-NEXT: movi v1.4h, #128, lsl #8
226+
; CHECK-NEXT: fmul v0.4h, v0.4h, v1.4h
227+
; CHECK-NEXT: ret
228+
%b = fmul <4 x half> %a, <half -0.0, half -0.0, half -0.0, half -0.0>
229+
ret <4 x half> %b
230+
}
231+
232+
define <8 x bfloat> @poszero_v8bf16(<8 x bfloat> %a) {
233+
; CHECK-LABEL: poszero_v8bf16:
234+
; CHECK: // %bb.0:
235+
; CHECK-NEXT: movi v0.2d, #0000000000000000
236+
; CHECK-NEXT: ret
237+
ret <8 x bfloat> <bfloat 0.0, bfloat 0.0, bfloat 0.0, bfloat 0.0, bfloat 0.0, bfloat 0.0, bfloat 0.0, bfloat 0.0>
238+
}
239+
240+
define <8 x bfloat> @negzero_v8bf16(<8 x bfloat> %a) {
241+
; CHECK-LABEL: negzero_v8bf16:
242+
; CHECK: // %bb.0:
243+
; CHECK-NEXT: movi v0.8h, #128, lsl #8
244+
; CHECK-NEXT: ret
245+
ret <8 x bfloat> <bfloat -0.0, bfloat -0.0, bfloat -0.0, bfloat -0.0, bfloat -0.0, bfloat -0.0, bfloat -0.0, bfloat -0.0>
246+
}
247+
248+
define <4 x bfloat> @poszero_v4bf16(<4 x bfloat> %a) {
249+
; CHECK-LABEL: poszero_v4bf16:
250+
; CHECK: // %bb.0:
251+
; CHECK-NEXT: movi d0, #0000000000000000
252+
; CHECK-NEXT: ret
253+
ret <4 x bfloat> <bfloat 0.0, bfloat 0.0, bfloat 0.0, bfloat 0.0>
254+
}
255+
256+
define <4 x bfloat> @negzero_v4bf16(<4 x bfloat> %a) {
257+
; CHECK-LABEL: negzero_v4bf16:
258+
; CHECK: // %bb.0:
259+
; CHECK-NEXT: movi v0.4h, #128, lsl #8
260+
; CHECK-NEXT: ret
261+
ret <4 x bfloat> <bfloat -0.0, bfloat -0.0, bfloat -0.0, bfloat -0.0>
262+
}

0 commit comments

Comments
 (0)