Skip to content

Commit d5fd3d9

Browse files
committed
[AArch64] Match pairwise add/fadd pattern
D75689 turns the faddp pattern into a shuffle followed by a vector add. Match this new pattern in a target-specific DAG combine, rather than in ISel, because legalization (for v2f32) turns it into a bit of a mess. The combine is extended to cover f16, f32, f64 and i64.
1 parent 3ee87a9 commit d5fd3d9

File tree

5 files changed

+81
-51
lines changed

5 files changed

+81
-51
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
745745
setTargetDAGCombine(ISD::INTRINSIC_VOID);
746746
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
747747
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
748+
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
748749

749750
setTargetDAGCombine(ISD::GlobalAddress);
750751

@@ -11602,6 +11603,60 @@ performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
1160211603
return ResultHADD;
1160311604
}
1160411605

11606+
// Returns true when the pairwise-add rewrite below may be applied for the
// given node opcode and scalar result type:
//   - ISD::FADD: f32 and f64 always; f16 only when FullFP16 is true.
//   - ISD::ADD:  i64 only.
// Every other opcode yields false.
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
11607+
switch (Opcode) {
11608+
case ISD::FADD:
11609+
return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
11610+
case ISD::ADD:
11611+
return VT == MVT::i64;
11612+
default:
11613+
return false;
11614+
}
11615+
}
11616+
11617+
// Target DAG combine for EXTRACT_VECTOR_ELT nodes.
//
// N is the extract node: operand 0 is the source vector, operand 1 the lane
// index. When the pairwise-add shape described below is recognised, returns a
// replacement scalar node; otherwise returns an empty SDValue to signal that
// no change was made.
static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
11618+
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
11619+
ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
11620+
11621+
EVT VT = N->getValueType(0);
11622+
const bool FullFP16 =
11623+
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
11624+
11625+
// Rewrite for pairwise fadd pattern
11626+
// (f32 (extract_vector_elt
11627+
// (fadd (vXf32 Other)
11628+
// (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
11629+
// ->
11630+
// (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
11631+
// (extract_vector_elt (vXf32 Other) 1))
11632+
// The example above is written for f32, but the same rewrite is applied to
// every opcode/type pair accepted by hasPairwiseAdd (FADD on f16 with
// FullFP16, f32, f64; ADD on i64). Only extracts of constant lane 0 qualify.
if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
11633+
hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
11634+
SDLoc DL(N0);
11635+
SDValue N00 = N0->getOperand(0);
11636+
SDValue N01 = N0->getOperand(1);
11637+
11638+
// First assume the shuffle is the right-hand operand of the add.
ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
11639+
SDValue Other = N00;
11640+
11641+
// And handle the commutative case.
11642+
if (!Shuffle) {
11643+
Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
11644+
Other = N01;
11645+
}
11646+
11647+
// The shuffle must move lane 1 into lane 0 and must read from the very same
// vector that forms the other add operand; the remaining mask elements are
// not inspected because only lane 0 of the sum is extracted.
if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
11648+
Other == Shuffle->getOperand(0)) {
11649+
// Emit the scalar op on lanes 0 and 1 of Other, reusing the original opcode.
return DAG.getNode(N0->getOpcode(), DL, VT,
11650+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
11651+
DAG.getConstant(0, DL, MVT::i64)),
11652+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
11653+
DAG.getConstant(1, DL, MVT::i64)));
11654+
}
11655+
}
11656+
11657+
// No match: leave the node untouched.
return SDValue();
11658+
}
11659+
1160511660
static SDValue performConcatVectorsCombine(SDNode *N,
1160611661
TargetLowering::DAGCombinerInfo &DCI,
1160711662
SelectionDAG &DAG) {
@@ -14425,6 +14480,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
1442514480
return performUzpCombine(N, DAG);
1442614481
case ISD::INSERT_VECTOR_ELT:
1442714482
return performPostLD1Combine(N, DCI, true);
14483+
case ISD::EXTRACT_VECTOR_ELT:
14484+
return performExtractVectorEltCombine(N, DAG);
1442814485
case ISD::INTRINSIC_VOID:
1442914486
case ISD::INTRINSIC_W_CHAIN:
1443014487
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7482,6 +7482,9 @@ def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
74827482
def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
74837483
(vector_extract (v4f32 FPR128:$Rn), (i64 1))),
74847484
(f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
7485+
// Select an fadd of lanes 0 and 1 of a v8f16 value held in FPR128 as
// FADDPv2i16p applied to the dsub subregister, producing an f16 result.
def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)),
7486+
(vector_extract (v8f16 FPR128:$Rn), (i64 1))),
7487+
(f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
74857488

74867489
// Scalar 64-bit shifts in FPR64 registers.
74877490
def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),

llvm/test/CodeGen/AArch64/faddp-half.ll

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,7 @@ define half @faddp_2xhalf(<2 x half> %a) {
66
; CHECK-LABEL: faddp_2xhalf:
77
; CHECK: // %bb.0: // %entry
88
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
9-
; CHECK-NEXT: dup v1.4h, v0.h[1]
10-
; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
11-
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
9+
; CHECK-NEXT: faddp h0, v0.2h
1210
; CHECK-NEXT: ret
1311
;
1412
; CHECKNOFP16-LABEL: faddp_2xhalf:
@@ -32,9 +30,7 @@ define half @faddp_2xhalf_commute(<2 x half> %a) {
3230
; CHECK-LABEL: faddp_2xhalf_commute:
3331
; CHECK: // %bb.0: // %entry
3432
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
35-
; CHECK-NEXT: dup v1.4h, v0.h[1]
36-
; CHECK-NEXT: fadd v0.4h, v1.4h, v0.4h
37-
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
33+
; CHECK-NEXT: faddp h0, v0.2h
3834
; CHECK-NEXT: ret
3935
;
4036
; CHECKNOFP16-LABEL: faddp_2xhalf_commute:
@@ -58,9 +54,7 @@ define half @faddp_4xhalf(<4 x half> %a) {
5854
; CHECK-LABEL: faddp_4xhalf:
5955
; CHECK: // %bb.0: // %entry
6056
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
61-
; CHECK-NEXT: dup v1.4h, v0.h[1]
62-
; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
63-
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
57+
; CHECK-NEXT: faddp h0, v0.2h
6458
; CHECK-NEXT: ret
6559
;
6660
; CHECKNOFP16-LABEL: faddp_4xhalf:
@@ -84,9 +78,7 @@ define half @faddp_4xhalf_commute(<4 x half> %a) {
8478
; CHECK-LABEL: faddp_4xhalf_commute:
8579
; CHECK: // %bb.0: // %entry
8680
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
87-
; CHECK-NEXT: dup v1.4h, v0.h[1]
88-
; CHECK-NEXT: fadd v0.4h, v1.4h, v0.4h
89-
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
81+
; CHECK-NEXT: faddp h0, v0.2h
9082
; CHECK-NEXT: ret
9183
;
9284
; CHECKNOFP16-LABEL: faddp_4xhalf_commute:
@@ -109,9 +101,7 @@ entry:
109101
define half @faddp_8xhalf(<8 x half> %a) {
110102
; CHECK-LABEL: faddp_8xhalf:
111103
; CHECK: // %bb.0: // %entry
112-
; CHECK-NEXT: dup v1.8h, v0.h[1]
113-
; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
114-
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
104+
; CHECK-NEXT: faddp h0, v0.2h
115105
; CHECK-NEXT: ret
116106
;
117107
; CHECKNOFP16-LABEL: faddp_8xhalf:
@@ -132,9 +122,7 @@ entry:
132122
define half @faddp_8xhalf_commute(<8 x half> %a) {
133123
; CHECK-LABEL: faddp_8xhalf_commute:
134124
; CHECK: // %bb.0: // %entry
135-
; CHECK-NEXT: dup v1.8h, v0.h[1]
136-
; CHECK-NEXT: fadd v0.8h, v1.8h, v0.8h
137-
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
125+
; CHECK-NEXT: faddp h0, v0.2h
138126
; CHECK-NEXT: ret
139127
;
140128
; CHECKNOFP16-LABEL: faddp_8xhalf_commute:

llvm/test/CodeGen/AArch64/faddp.ll

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@ define float @faddp_2xfloat(<2 x float> %a) {
55
; CHECK-LABEL: faddp_2xfloat:
66
; CHECK: // %bb.0: // %entry
77
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
8-
; CHECK-NEXT: dup v1.2s, v0.s[1]
9-
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
10-
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
8+
; CHECK-NEXT: faddp s0, v0.2s
119
; CHECK-NEXT: ret
1210
entry:
1311
%shift = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
@@ -19,9 +17,7 @@ entry:
1917
define float @faddp_4xfloat(<4 x float> %a) {
2018
; CHECK-LABEL: faddp_4xfloat:
2119
; CHECK: // %bb.0: // %entry
22-
; CHECK-NEXT: dup v1.4s, v0.s[1]
23-
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
24-
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
20+
; CHECK-NEXT: faddp s0, v0.2s
2521
; CHECK-NEXT: ret
2622
entry:
2723
%shift = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -33,9 +29,7 @@ entry:
3329
define float @faddp_4xfloat_commute(<4 x float> %a) {
3430
; CHECK-LABEL: faddp_4xfloat_commute:
3531
; CHECK: // %bb.0: // %entry
36-
; CHECK-NEXT: dup v1.4s, v0.s[1]
37-
; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
38-
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
32+
; CHECK-NEXT: faddp s0, v0.2s
3933
; CHECK-NEXT: ret
4034
entry:
4135
%shift = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -48,9 +42,7 @@ define float @faddp_2xfloat_commute(<2 x float> %a) {
4842
; CHECK-LABEL: faddp_2xfloat_commute:
4943
; CHECK: // %bb.0: // %entry
5044
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
51-
; CHECK-NEXT: dup v1.2s, v0.s[1]
52-
; CHECK-NEXT: fadd v0.2s, v1.2s, v0.2s
53-
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
45+
; CHECK-NEXT: faddp s0, v0.2s
5446
; CHECK-NEXT: ret
5547
entry:
5648
%shift = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
@@ -62,9 +54,7 @@ entry:
6254
define double @faddp_2xdouble(<2 x double> %a) {
6355
; CHECK-LABEL: faddp_2xdouble:
6456
; CHECK: // %bb.0: // %entry
65-
; CHECK-NEXT: dup v1.2d, v0.d[1]
66-
; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
67-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
57+
; CHECK-NEXT: faddp d0, v0.2d
6858
; CHECK-NEXT: ret
6959
entry:
7060
%shift = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -76,9 +66,7 @@ entry:
7666
define double @faddp_2xdouble_commute(<2 x double> %a) {
7767
; CHECK-LABEL: faddp_2xdouble_commute:
7868
; CHECK: // %bb.0: // %entry
79-
; CHECK-NEXT: dup v1.2d, v0.d[1]
80-
; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d
81-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
69+
; CHECK-NEXT: faddp d0, v0.2d
8270
; CHECK-NEXT: ret
8371
entry:
8472
%shift = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -90,8 +78,7 @@ entry:
9078
define i64 @addp_2xi64(<2 x i64> %a) {
9179
; CHECK-LABEL: addp_2xi64:
9280
; CHECK: // %bb.0: // %entry
93-
; CHECK-NEXT: dup v1.2d, v0.d[1]
94-
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
81+
; CHECK-NEXT: addp d0, v0.2d
9582
; CHECK-NEXT: fmov x0, d0
9683
; CHECK-NEXT: ret
9784
entry:
@@ -104,8 +91,7 @@ entry:
10491
define i64 @addp_2xi64_commute(<2 x i64> %a) {
10592
; CHECK-LABEL: addp_2xi64_commute:
10693
; CHECK: // %bb.0: // %entry
107-
; CHECK-NEXT: dup v1.2d, v0.d[1]
108-
; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
94+
; CHECK-NEXT: addp d0, v0.2d
10995
; CHECK-NEXT: fmov x0, d0
11096
; CHECK-NEXT: ret
11197
entry:

llvm/test/CodeGen/AArch64/vecreduce-fadd.ll

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,9 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
2222
; CHECK-LABEL: add_HalfH:
2323
; CHECK: // %bb.0:
2424
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
25-
; CHECK-NEXT: mov h3, v0.h[1]
2625
; CHECK-NEXT: mov h1, v0.h[3]
2726
; CHECK-NEXT: mov h2, v0.h[2]
28-
; CHECK-NEXT: fadd h0, h0, h3
27+
; CHECK-NEXT: faddp h0, v0.2h
2928
; CHECK-NEXT: fadd h0, h0, h2
3029
; CHECK-NEXT: fadd h0, h0, h1
3130
; CHECK-NEXT: ret
@@ -59,10 +58,9 @@ define half @add_H(<8 x half> %bin.rdx) {
5958
; CHECK: // %bb.0:
6059
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
6160
; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
62-
; CHECK-NEXT: mov h1, v0.h[1]
63-
; CHECK-NEXT: mov h2, v0.h[2]
64-
; CHECK-NEXT: fadd h1, h0, h1
65-
; CHECK-NEXT: fadd h1, h1, h2
61+
; CHECK-NEXT: mov h1, v0.h[2]
62+
; CHECK-NEXT: faddp h2, v0.2h
63+
; CHECK-NEXT: fadd h1, h2, h1
6664
; CHECK-NEXT: mov h0, v0.h[3]
6765
; CHECK-NEXT: fadd h0, h1, h0
6866
; CHECK-NEXT: ret
@@ -105,7 +103,6 @@ define half @add_H(<8 x half> %bin.rdx) {
105103
; CHECKNOFP16-NEXT: fadd s0, s0, s1
106104
; CHECKNOFP16-NEXT: fcvt h0, s0
107105
; CHECKNOFP16-NEXT: ret
108-
109106
%r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx)
110107
ret half %r
111108
}
@@ -148,10 +145,9 @@ define half @add_2H(<16 x half> %bin.rdx) {
148145
; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
149146
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
150147
; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
151-
; CHECK-NEXT: mov h1, v0.h[1]
152-
; CHECK-NEXT: mov h2, v0.h[2]
153-
; CHECK-NEXT: fadd h1, h0, h1
154-
; CHECK-NEXT: fadd h1, h1, h2
148+
; CHECK-NEXT: mov h1, v0.h[2]
149+
; CHECK-NEXT: faddp h2, v0.2h
150+
; CHECK-NEXT: fadd h1, h2, h1
155151
; CHECK-NEXT: mov h0, v0.h[3]
156152
; CHECK-NEXT: fadd h0, h1, h0
157153
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)