Skip to content

Commit 264b1b2

Browse files
committed
[ARM] Convert vector fdiv+fcvt fixed-point combine to fmul.
InstCombine converts an fdiv by a power of 2 into an fmul, so this changes PerformVDIVCombine, which folds fdiv+fcvt into a fixed-point fcvt, to match fmul+fcvt instead. The fdiv tests will look worse, but that pattern won't appear in practice (and should be improved again by #93882).
1 parent b71434f commit 264b1b2

File tree

2 files changed

+113
-60
lines changed

2 files changed

+113
-60
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,7 +1000,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
10001000
}
10011001

10021002
setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT,
1003-
ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD});
1003+
ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
10041004

10051005
// It is legal to extload from v4i8 to v4i16 or v4i32.
10061006
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
@@ -17011,17 +17011,17 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
1701117011
return SDValue();
1701217012
}
1701317013

17014-
/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17015-
/// can replace combinations of VCVT (integer to floating-point) and VDIV
17016-
/// when the VDIV has a constant operand that is a power of 2.
17014+
/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17015+
/// can replace combinations of VCVT (integer to floating-point) and VMUL
17016+
/// when the VMUL has a constant operand that is a power of 2.
1701717017
///
17018-
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
17018+
/// Example (assume d17 = <float 0.125, float 0.125>):
1701917019
/// vcvt.f32.s32 d16, d16
17020-
/// vdiv.f32 d16, d17, d16
17020+
/// vmul.f32 d16, d16, d17
1702117021
/// becomes:
1702217022
/// vcvt.f32.s32 d16, d16, #3
17023-
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
17024-
const ARMSubtarget *Subtarget) {
17023+
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
17024+
const ARMSubtarget *Subtarget) {
1702517025
if (!Subtarget->hasNEON())
1702617026
return SDValue();
1702717027

@@ -17048,26 +17048,34 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
1704817048
return SDValue();
1704917049
}
1705017050

17051-
BitVector UndefElements;
17052-
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17053-
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
17051+
ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17052+
APFloat Recip(0.0f);
17053+
if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17054+
return SDValue();
17055+
17056+
bool IsExact;
17057+
APSInt IntVal(33);
17058+
if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17059+
APFloat::opOK ||
17060+
!IsExact)
17061+
return SDValue();
17062+
17063+
int32_t C = IntVal.exactLogBase2();
1705417064
if (C == -1 || C == 0 || C > 32)
1705517065
return SDValue();
1705617066

17057-
SDLoc dl(N);
17067+
SDLoc DL(N);
1705817068
bool isSigned = OpOpcode == ISD::SINT_TO_FP;
1705917069
SDValue ConvInput = Op.getOperand(0);
1706017070
if (IntBits < FloatBits)
17061-
ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
17062-
dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
17063-
ConvInput);
17071+
ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17072+
NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
1706417073

17065-
unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
17066-
Intrinsic::arm_neon_vcvtfxu2fp;
17067-
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
17068-
Op.getValueType(),
17069-
DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
17070-
ConvInput, DAG.getConstant(C, dl, MVT::i32));
17074+
unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17075+
: Intrinsic::arm_neon_vcvtfxu2fp;
17076+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17077+
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17078+
DAG.getConstant(C, DL, MVT::i32));
1707117079
}
1707217080

1707317081
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
@@ -18897,8 +18905,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
1889718905
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
1889818906
case ISD::FADD:
1889918907
return PerformFADDCombine(N, DCI.DAG, Subtarget);
18900-
case ISD::FDIV:
18901-
return PerformVDIVCombine(N, DCI.DAG, Subtarget);
18908+
case ISD::FMUL:
18909+
return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
1890218910
case ISD::INTRINSIC_WO_CHAIN:
1890318911
return PerformIntrinsicCombine(N, DCI);
1890418912
case ISD::SHL:

llvm/test/CodeGen/ARM/vdiv_combine.ll

Lines changed: 82 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55
define arm_aapcs_vfpcc <2 x float> @t1(<2 x i32> %vecinit2.i) nounwind {
66
; CHECK-LABEL: t1:
77
; CHECK: @ %bb.0: @ %entry
8-
; CHECK-NEXT: vcvt.f32.s32 d0, d0, #3
8+
; CHECK-NEXT: vmov.f32 s2, #8.000000e+00
9+
; CHECK-NEXT: vcvt.f32.s32 d2, d0
10+
; CHECK-NEXT: vdiv.f32 s1, s5, s2
11+
; CHECK-NEXT: vdiv.f32 s0, s4, s2
912
; CHECK-NEXT: bx lr
1013
entry:
1114
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -17,7 +20,10 @@ entry:
1720
define arm_aapcs_vfpcc <2 x float> @t2(<2 x i32> %vecinit2.i) nounwind {
1821
; CHECK-LABEL: t2:
1922
; CHECK: @ %bb.0: @ %entry
20-
; CHECK-NEXT: vcvt.f32.u32 d0, d0, #3
23+
; CHECK-NEXT: vmov.f32 s2, #8.000000e+00
24+
; CHECK-NEXT: vcvt.f32.u32 d2, d0
25+
; CHECK-NEXT: vdiv.f32 s1, s5, s2
26+
; CHECK-NEXT: vdiv.f32 s0, s4, s2
2127
; CHECK-NEXT: bx lr
2228
entry:
2329
%vcvt.i = uitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -71,8 +77,17 @@ entry:
7177
define arm_aapcs_vfpcc <2 x float> @t5(<2 x i32> %vecinit2.i) nounwind {
7278
; CHECK-LABEL: t5:
7379
; CHECK: @ %bb.0: @ %entry
74-
; CHECK-NEXT: vcvt.f32.s32 d0, d0, #32
80+
; CHECK-NEXT: vcvt.f32.s32 d2, d0
81+
; CHECK-NEXT: vldr s2, LCPI4_0
82+
; CHECK-NEXT: vdiv.f32 s1, s5, s2
83+
; CHECK-NEXT: vdiv.f32 s0, s4, s2
7584
; CHECK-NEXT: bx lr
85+
; CHECK-NEXT: .p2align 2
86+
; CHECK-NEXT: @ %bb.1:
87+
; CHECK-NEXT: .data_region
88+
; CHECK-NEXT: LCPI4_0:
89+
; CHECK-NEXT: .long 0x4f800000 @ float 4.2949673E+9
90+
; CHECK-NEXT: .end_data_region
7691
entry:
7792
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
7893
%div.i = fdiv <2 x float> %vcvt.i, <float 0x41F0000000000000, float 0x41F0000000000000>
@@ -83,7 +98,12 @@ entry:
8398
define arm_aapcs_vfpcc <4 x float> @t6(<4 x i32> %vecinit6.i) nounwind {
8499
; CHECK-LABEL: t6:
85100
; CHECK: @ %bb.0: @ %entry
86-
; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
101+
; CHECK-NEXT: vmov.f32 s4, #8.000000e+00
102+
; CHECK-NEXT: vcvt.f32.s32 q2, q0
103+
; CHECK-NEXT: vdiv.f32 s3, s11, s4
104+
; CHECK-NEXT: vdiv.f32 s2, s10, s4
105+
; CHECK-NEXT: vdiv.f32 s1, s9, s4
106+
; CHECK-NEXT: vdiv.f32 s0, s8, s4
87107
; CHECK-NEXT: bx lr
88108
entry:
89109
%vcvt.i = sitofp <4 x i32> %vecinit6.i to <4 x float>
@@ -95,7 +115,12 @@ define arm_aapcs_vfpcc <4 x float> @fix_unsigned_i16_to_float(<4 x i16> %in) {
95115
; CHECK-LABEL: fix_unsigned_i16_to_float:
96116
; CHECK: @ %bb.0:
97117
; CHECK-NEXT: vmovl.u16 q8, d0
98-
; CHECK-NEXT: vcvt.f32.u32 q0, q8, #1
118+
; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
119+
; CHECK-NEXT: vcvt.f32.u32 q2, q8
120+
; CHECK-NEXT: vdiv.f32 s3, s11, s4
121+
; CHECK-NEXT: vdiv.f32 s2, s10, s4
122+
; CHECK-NEXT: vdiv.f32 s1, s9, s4
123+
; CHECK-NEXT: vdiv.f32 s0, s8, s4
99124
; CHECK-NEXT: bx lr
100125
%conv = uitofp <4 x i16> %in to <4 x float>
101126
%shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
@@ -106,7 +131,12 @@ define arm_aapcs_vfpcc <4 x float> @fix_signed_i16_to_float(<4 x i16> %in) {
106131
; CHECK-LABEL: fix_signed_i16_to_float:
107132
; CHECK: @ %bb.0:
108133
; CHECK-NEXT: vmovl.s16 q8, d0
109-
; CHECK-NEXT: vcvt.f32.s32 q0, q8, #1
134+
; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
135+
; CHECK-NEXT: vcvt.f32.s32 q2, q8
136+
; CHECK-NEXT: vdiv.f32 s3, s11, s4
137+
; CHECK-NEXT: vdiv.f32 s2, s10, s4
138+
; CHECK-NEXT: vdiv.f32 s1, s9, s4
139+
; CHECK-NEXT: vdiv.f32 s0, s8, s4
110140
; CHECK-NEXT: bx lr
111141
%conv = sitofp <4 x i16> %in to <4 x float>
112142
%shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
@@ -166,8 +196,19 @@ define arm_aapcs_vfpcc <2 x double> @fix_i64_to_double(<2 x i64> %in) {
166196
define arm_aapcs_vfpcc <8 x float> @test7(<8 x i32> %in) nounwind {
167197
; CHECK-LABEL: test7:
168198
; CHECK: @ %bb.0: @ %entry
169-
; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
170-
; CHECK-NEXT: vcvt.f32.s32 q1, q1, #3
199+
; CHECK-NEXT: vpush {d8, d9}
200+
; CHECK-NEXT: vmov.f32 s12, #8.000000e+00
201+
; CHECK-NEXT: vcvt.f32.s32 q4, q0
202+
; CHECK-NEXT: vcvt.f32.s32 q2, q1
203+
; CHECK-NEXT: vdiv.f32 s3, s19, s12
204+
; CHECK-NEXT: vdiv.f32 s7, s11, s12
205+
; CHECK-NEXT: vdiv.f32 s2, s18, s12
206+
; CHECK-NEXT: vdiv.f32 s6, s10, s12
207+
; CHECK-NEXT: vdiv.f32 s1, s17, s12
208+
; CHECK-NEXT: vdiv.f32 s5, s9, s12
209+
; CHECK-NEXT: vdiv.f32 s0, s16, s12
210+
; CHECK-NEXT: vdiv.f32 s4, s8, s12
211+
; CHECK-NEXT: vpop {d8, d9}
171212
; CHECK-NEXT: bx lr
172213
entry:
173214
%vcvt.i = sitofp <8 x i32> %in to <8 x float>
@@ -179,8 +220,19 @@ entry:
179220
define arm_aapcs_vfpcc <4 x float> @test8(<4 x i32> %in) {
180221
; CHECK-LABEL: test8:
181222
; CHECK: @ %bb.0:
182-
; CHECK-NEXT: vcvt.f32.s32 q0, q0, #1
223+
; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
224+
; CHECK-NEXT: vcvt.f32.s32 q2, q0
225+
; CHECK-NEXT: vdiv.f32 s2, s10, s4
226+
; CHECK-NEXT: vdiv.f32 s1, s9, s4
227+
; CHECK-NEXT: vdiv.f32 s0, s8, s4
228+
; CHECK-NEXT: vldr s3, LCPI11_0
183229
; CHECK-NEXT: bx lr
230+
; CHECK-NEXT: .p2align 2
231+
; CHECK-NEXT: @ %bb.1:
232+
; CHECK-NEXT: .data_region
233+
; CHECK-NEXT: LCPI11_0:
234+
; CHECK-NEXT: .long 0x7fc00000 @ float NaN
235+
; CHECK-NEXT: .end_data_region
184236
%vcvt.i = sitofp <4 x i32> %in to <4 x float>
185237
%div.i = fdiv <4 x float> %vcvt.i, <float 2.0, float 2.0, float 2.0, float undef>
186238
ret <4 x float> %div.i
@@ -189,8 +241,19 @@ define arm_aapcs_vfpcc <4 x float> @test8(<4 x i32> %in) {
189241
define arm_aapcs_vfpcc <3 x float> @test_illegal_int_to_fp(<3 x i32> %in) {
190242
; CHECK-LABEL: test_illegal_int_to_fp:
191243
; CHECK: @ %bb.0:
192-
; CHECK-NEXT: vcvt.f32.s32 q0, q0, #2
244+
; CHECK-NEXT: vmov.f32 s4, #4.000000e+00
245+
; CHECK-NEXT: vcvt.f32.s32 q2, q0
246+
; CHECK-NEXT: vdiv.f32 s2, s10, s4
247+
; CHECK-NEXT: vdiv.f32 s1, s9, s4
248+
; CHECK-NEXT: vdiv.f32 s0, s8, s4
249+
; CHECK-NEXT: vldr s3, LCPI12_0
193250
; CHECK-NEXT: bx lr
251+
; CHECK-NEXT: .p2align 2
252+
; CHECK-NEXT: @ %bb.1:
253+
; CHECK-NEXT: .data_region
254+
; CHECK-NEXT: LCPI12_0:
255+
; CHECK-NEXT: .long 0x7fc00000 @ float NaN
256+
; CHECK-NEXT: .end_data_region
194257
%conv = sitofp <3 x i32> %in to <3 x float>
195258
%res = fdiv <3 x float> %conv, <float 4.0, float 4.0, float 4.0>
196259
ret <3 x float> %res
@@ -200,9 +263,7 @@ define arm_aapcs_vfpcc <3 x float> @test_illegal_int_to_fp(<3 x i32> %in) {
200263
define arm_aapcs_vfpcc <2 x float> @t1_mul(<2 x i32> %vecinit2.i) local_unnamed_addr #0 {
201264
; CHECK-LABEL: t1_mul:
202265
; CHECK: @ %bb.0: @ %entry
203-
; CHECK-NEXT: vcvt.f32.s32 d16, d0
204-
; CHECK-NEXT: vmov.i32 d17, #0x3e000000
205-
; CHECK-NEXT: vmul.f32 d0, d16, d17
266+
; CHECK-NEXT: vcvt.f32.s32 d0, d0, #3
206267
; CHECK-NEXT: bx lr
207268
entry:
208269
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -213,9 +274,7 @@ entry:
213274
define arm_aapcs_vfpcc <2 x float> @t2_mul(<2 x i32> %vecinit2.i) local_unnamed_addr #0 {
214275
; CHECK-LABEL: t2_mul:
215276
; CHECK: @ %bb.0: @ %entry
216-
; CHECK-NEXT: vcvt.f32.u32 d16, d0
217-
; CHECK-NEXT: vmov.i32 d17, #0x3e000000
218-
; CHECK-NEXT: vmul.f32 d0, d16, d17
277+
; CHECK-NEXT: vcvt.f32.u32 d0, d0, #3
219278
; CHECK-NEXT: bx lr
220279
entry:
221280
%vcvt.i = uitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -239,10 +298,7 @@ entry:
239298
define arm_aapcs_vfpcc <2 x float> @t5_mul(<2 x i32> %vecinit2.i) local_unnamed_addr #0 {
240299
; CHECK-LABEL: t5_mul:
241300
; CHECK: @ %bb.0: @ %entry
242-
; CHECK-NEXT: vcvt.f32.s32 d16, d0
243-
; CHECK-NEXT: mov r0, #796917760
244-
; CHECK-NEXT: vdup.32 d17, r0
245-
; CHECK-NEXT: vmul.f32 d0, d16, d17
301+
; CHECK-NEXT: vcvt.f32.s32 d0, d0, #32
246302
; CHECK-NEXT: bx lr
247303
entry:
248304
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -253,9 +309,7 @@ entry:
253309
define arm_aapcs_vfpcc <4 x float> @t6_mul(<4 x i32> %vecinit6.i) local_unnamed_addr #0 {
254310
; CHECK-LABEL: t6_mul:
255311
; CHECK: @ %bb.0: @ %entry
256-
; CHECK-NEXT: vcvt.f32.s32 q8, q0
257-
; CHECK-NEXT: vmov.i32 q9, #0x3e000000
258-
; CHECK-NEXT: vmul.f32 q0, q8, q9
312+
; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
259313
; CHECK-NEXT: bx lr
260314
entry:
261315
%vcvt.i = sitofp <4 x i32> %vecinit6.i to <4 x float>
@@ -267,9 +321,7 @@ define arm_aapcs_vfpcc <4 x float> @fix_unsigned_i16_to_float_mul(<4 x i16> %in)
267321
; CHECK-LABEL: fix_unsigned_i16_to_float_mul:
268322
; CHECK: @ %bb.0:
269323
; CHECK-NEXT: vmovl.u16 q8, d0
270-
; CHECK-NEXT: vmov.i32 q9, #0x3f000000
271-
; CHECK-NEXT: vcvt.f32.u32 q8, q8
272-
; CHECK-NEXT: vmul.f32 q0, q8, q9
324+
; CHECK-NEXT: vcvt.f32.u32 q0, q8, #1
273325
; CHECK-NEXT: bx lr
274326
%conv = uitofp <4 x i16> %in to <4 x float>
275327
%shift = fmul <4 x float> %conv, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
@@ -280,9 +332,7 @@ define arm_aapcs_vfpcc <4 x float> @fix_signed_i16_to_float_mul(<4 x i16> %in) l
280332
; CHECK-LABEL: fix_signed_i16_to_float_mul:
281333
; CHECK: @ %bb.0:
282334
; CHECK-NEXT: vmovl.s16 q8, d0
283-
; CHECK-NEXT: vmov.i32 q9, #0x3f000000
284-
; CHECK-NEXT: vcvt.f32.s32 q8, q8
285-
; CHECK-NEXT: vmul.f32 q0, q8, q9
335+
; CHECK-NEXT: vcvt.f32.s32 q0, q8, #1
286336
; CHECK-NEXT: bx lr
287337
%conv = sitofp <4 x i16> %in to <4 x float>
288338
%shift = fmul <4 x float> %conv, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
@@ -340,11 +390,8 @@ define arm_aapcs_vfpcc <2 x double> @fix_i64_to_double_mul(<2 x i64> %in) local_
340390
define arm_aapcs_vfpcc <8 x float> @test7_mul(<8 x i32> %in) local_unnamed_addr #0 {
341391
; CHECK-LABEL: test7_mul:
342392
; CHECK: @ %bb.0: @ %entry
343-
; CHECK-NEXT: vcvt.f32.s32 q8, q0
344-
; CHECK-NEXT: vcvt.f32.s32 q9, q1
345-
; CHECK-NEXT: vmov.i32 q10, #0x3e000000
346-
; CHECK-NEXT: vmul.f32 q0, q8, q10
347-
; CHECK-NEXT: vmul.f32 q1, q9, q10
393+
; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
394+
; CHECK-NEXT: vcvt.f32.s32 q1, q1, #3
348395
; CHECK-NEXT: bx lr
349396
entry:
350397
%vcvt.i = sitofp <8 x i32> %in to <8 x float>
@@ -355,9 +402,7 @@ entry:
355402
define arm_aapcs_vfpcc <3 x float> @test_illegal_int_to_fp_mul(<3 x i32> %in) local_unnamed_addr #0 {
356403
; CHECK-LABEL: test_illegal_int_to_fp_mul:
357404
; CHECK: @ %bb.0:
358-
; CHECK-NEXT: vcvt.f32.s32 q8, q0
359-
; CHECK-NEXT: vmov.f32 q9, #2.500000e-01
360-
; CHECK-NEXT: vmul.f32 q0, q8, q9
405+
; CHECK-NEXT: vcvt.f32.s32 q0, q0, #2
361406
; CHECK-NEXT: bx lr
362407
%conv = sitofp <3 x i32> %in to <3 x float>
363408
%res = fmul <3 x float> %conv, <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>

0 commit comments

Comments
 (0)