Skip to content

Commit 92a9bcc

Browse files
authored
[AArch64] Add tablegen patterns for fmla index with extract 0. (#114976)
We have tablegen patterns to produce an indexed `fmla s0, s1, v2.s[2]` from `fma extract(Rn, lane), Rm, Ra -> fmla`. For the case of lane==0, however, we want to prefer the simple `fmadd s0, s1, s2`, so we also have patterns for `fma extract(Rn, 0), Rm, Ra -> fmadd`. The problem arises when we have two extracts: tablegen starts to prefer the second (fmadd) pattern, as it looks more specialized. This patch adds additional patterns to catch this case: `fma extract(Rn, index), extract(Rm, 0), Ra -> fmla`. To make sure the simpler fmadd keeps being selected when both operands are extracted from lane 0, we need to add patterns for that case too: `fma extract(Rn, 0), extract(Rm, 0), Ra -> fmadd`.
1 parent 3356eb3 commit 92a9bcc

File tree

4 files changed

+50
-24
lines changed

4 files changed

+50
-24
lines changed

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5821,6 +5821,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
58215821
(f16 FPR16:$Ra))),
58225822
(!cast<Instruction>(NAME # Hrrr)
58235823
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)), FPR16:$Rm, FPR16:$Ra)>;
5824+
5825+
def : Pat<(f16 (node (f16 (extractelt (v8f16 V128:$Rn), (i64 0))),
5826+
(f16 (extractelt (v8f16 V128:$Rm), (i64 0))),
5827+
(f16 FPR16:$Ra))),
5828+
(!cast<Instruction>(NAME # Hrrr)
5829+
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)),
5830+
(f16 (EXTRACT_SUBREG V128:$Rm, hsub)), FPR16:$Ra)>;
58245831
}
58255832

58265833
def : Pat<(f32 (node (f32 FPR32:$Rn),
@@ -5835,6 +5842,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
58355842
(!cast<Instruction>(NAME # Srrr)
58365843
(EXTRACT_SUBREG V128:$Rn, ssub), FPR32:$Rm, FPR32:$Ra)>;
58375844

5845+
def : Pat<(f32 (node (f32 (extractelt (v4f32 V128:$Rn), (i64 0))),
5846+
(f32 (extractelt (v4f32 V128:$Rm), (i64 0))),
5847+
(f32 FPR32:$Ra))),
5848+
(!cast<Instruction>(NAME # Srrr)
5849+
(EXTRACT_SUBREG V128:$Rn, ssub),
5850+
(EXTRACT_SUBREG V128:$Rm, ssub), FPR32:$Ra)>;
5851+
58385852
def : Pat<(f64 (node (f64 FPR64:$Rn),
58395853
(f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
58405854
(f64 FPR64:$Ra))),
@@ -5846,6 +5860,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
58465860
(f64 FPR64:$Ra))),
58475861
(!cast<Instruction>(NAME # Drrr)
58485862
(EXTRACT_SUBREG V128:$Rn, dsub), FPR64:$Rm, FPR64:$Ra)>;
5863+
5864+
def : Pat<(f64 (node (f64 (extractelt (v2f64 V128:$Rn), (i64 0))),
5865+
(f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
5866+
(f64 FPR64:$Ra))),
5867+
(!cast<Instruction>(NAME # Drrr)
5868+
(EXTRACT_SUBREG V128:$Rn, dsub),
5869+
(EXTRACT_SUBREG V128:$Rm, dsub), FPR64:$Ra)>;
58495870
}
58505871

58515872
//---
@@ -9282,6 +9303,11 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
92829303
(vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
92839304
(!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
92849305
V128_lo:$Rm, VectorIndexH:$idx)>;
9306+
def : Pat<(f16 (OpNode (f16 FPR16:$Rd),
9307+
(vector_extract (v8f16 V128:$Rn), (i64 0)),
9308+
(vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
9309+
(!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd,
9310+
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)), V128_lo:$Rm, VectorIndexH:$idx)>;
92859311
} // Predicates = [HasNEON, HasFullFP16]
92869312

92879313
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
@@ -9323,12 +9349,22 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
93239349
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
93249350
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
93259351
V128:$Rm, VectorIndexS:$idx)>;
9352+
def : Pat<(f32 (OpNode (f32 FPR32:$Rd),
9353+
(vector_extract (v4f32 V128:$Rn), (i64 0)),
9354+
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
9355+
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd,
9356+
(f32 (EXTRACT_SUBREG V128:$Rn, ssub)), V128:$Rm, VectorIndexS:$idx)>;
93269357

93279358
// 1 variant for 64-bit scalar version: extract from .1d or from .2d
93289359
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
93299360
(vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
93309361
(!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
93319362
V128:$Rm, VectorIndexD:$idx)>;
9363+
def : Pat<(f64 (OpNode (f64 FPR64:$Rd),
9364+
(vector_extract (v2f64 V128:$Rn), (i64 0)),
9365+
(vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
9366+
(!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd,
9367+
(f64 (EXTRACT_SUBREG V128:$Rn, dsub)), V128:$Rm, VectorIndexD:$idx)>;
93329368
}
93339369

93349370
let mayRaiseFPException = 1, Uses = [FPCR] in

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
1111
; CHECK-NEXT: mov h2, v0.h[1]
1212
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1313
; CHECK-NEXT: fmul h3, h0, v1.h[1]
14-
; CHECK-NEXT: fmul h4, h2, v1.h[1]
15-
; CHECK-NEXT: fmadd h2, h1, h2, h3
16-
; CHECK-NEXT: fnmsub h0, h1, h0, h4
17-
; CHECK-NEXT: mov v0.h[1], v2.h[0]
14+
; CHECK-NEXT: fmul h2, h2, v1.h[1]
15+
; CHECK-NEXT: fmla h3, h1, v0.h[1]
16+
; CHECK-NEXT: fnmsub h0, h1, h0, h2
17+
; CHECK-NEXT: mov v0.h[1], v3.h[0]
1818
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
1919
; CHECK-NEXT: ret
2020
entry:

llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,7 @@ define half @t_vfmah_lane_f16_3_0(half %a, <4 x half> %c) {
120120
; CHECK-LABEL: t_vfmah_lane_f16_3_0:
121121
; CHECK: // %bb.0: // %entry
122122
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
123-
; CHECK-NEXT: mov h2, v1.h[3]
124-
; CHECK-NEXT: fmadd h0, h1, h2, h0
123+
; CHECK-NEXT: fmla h0, h1, v1.h[3]
125124
; CHECK-NEXT: ret
126125
entry:
127126
%b = extractelement <4 x half> %c, i32 0
@@ -310,8 +309,7 @@ define half @t_vfmsh_lane_f16_0_3(half %a, <4 x half> %c, i32 %lane) {
310309
; CHECK-LABEL: t_vfmsh_lane_f16_0_3:
311310
; CHECK: // %bb.0: // %entry
312311
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
313-
; CHECK-NEXT: mov h2, v1.h[3]
314-
; CHECK-NEXT: fmsub h0, h2, h1, h0
312+
; CHECK-NEXT: fmls h0, h1, v1.h[3]
315313
; CHECK-NEXT: ret
316314
entry:
317315
%b = extractelement <4 x half> %c, i32 0

llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,7 @@ define float @test_fmla_ss2S_1(float %a, float %b, <2 x float> %v) {
8484
define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
8585
; CHECK-LABEL: test_fmla_ss4S_3_ext0:
8686
; CHECK: // %bb.0:
87-
; CHECK-NEXT: mov s2, v1.s[3]
88-
; CHECK-NEXT: fmadd s0, s1, s2, s0
87+
; CHECK-NEXT: fmla s0, s1, v1.s[3]
8988
; CHECK-NEXT: ret
9089
%tmp0 = extractelement <4 x float> %v, i32 0
9190
%tmp1 = extractelement <4 x float> %v, i32 3
@@ -96,8 +95,7 @@ define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
9695
define float @test_fmla_ss4S_3_ext0_swp(float %a, <4 x float> %v) {
9796
; CHECK-LABEL: test_fmla_ss4S_3_ext0_swp:
9897
; CHECK: // %bb.0:
99-
; CHECK-NEXT: mov s2, v1.s[3]
100-
; CHECK-NEXT: fmadd s0, s2, s1, s0
98+
; CHECK-NEXT: fmla s0, s1, v1.s[3]
10199
; CHECK-NEXT: ret
102100
%tmp0 = extractelement <4 x float> %v, i32 0
103101
%tmp1 = extractelement <4 x float> %v, i32 3
@@ -120,8 +118,7 @@ define float @test_fmla_ss2S_3_ext0(float %a, <2 x float> %v) {
120118
; CHECK-LABEL: test_fmla_ss2S_3_ext0:
121119
; CHECK: // %bb.0:
122120
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
123-
; CHECK-NEXT: mov s2, v1.s[1]
124-
; CHECK-NEXT: fmadd s0, s1, s2, s0
121+
; CHECK-NEXT: fmla s0, s1, v1.s[1]
125122
; CHECK-NEXT: ret
126123
%tmp0 = extractelement <2 x float> %v, i32 0
127124
%tmp1 = extractelement <2 x float> %v, i32 1
@@ -133,8 +130,7 @@ define float @test_fmla_ss2S_3_ext0_swp(float %a, <2 x float> %v) {
133130
; CHECK-LABEL: test_fmla_ss2S_3_ext0_swp:
134131
; CHECK: // %bb.0:
135132
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
136-
; CHECK-NEXT: mov s2, v1.s[1]
137-
; CHECK-NEXT: fmadd s0, s2, s1, s0
133+
; CHECK-NEXT: fmla s0, s1, v1.s[1]
138134
; CHECK-NEXT: ret
139135
%tmp0 = extractelement <2 x float> %v, i32 0
140136
%tmp1 = extractelement <2 x float> %v, i32 1
@@ -218,8 +214,7 @@ define double @test_fmla_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
218214
define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
219215
; CHECK-LABEL: test_fmla_ss2D_1_ext0:
220216
; CHECK: // %bb.0:
221-
; CHECK-NEXT: mov d2, v1.d[1]
222-
; CHECK-NEXT: fmadd d0, d1, d2, d0
217+
; CHECK-NEXT: fmla d0, d1, v1.d[1]
223218
; CHECK-NEXT: ret
224219
%tmp0 = extractelement <2 x double> %v, i32 0
225220
%tmp1 = extractelement <2 x double> %v, i32 1
@@ -230,8 +225,7 @@ define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
230225
define double @test_fmla_ss2D_1_ext0_swp(double %a, <2 x double> %v) {
231226
; CHECK-LABEL: test_fmla_ss2D_1_ext0_swp:
232227
; CHECK: // %bb.0:
233-
; CHECK-NEXT: mov d2, v1.d[1]
234-
; CHECK-NEXT: fmadd d0, d2, d1, d0
228+
; CHECK-NEXT: fmla d0, d1, v1.d[1]
235229
; CHECK-NEXT: ret
236230
%tmp0 = extractelement <2 x double> %v, i32 0
237231
%tmp1 = extractelement <2 x double> %v, i32 1
@@ -340,8 +334,7 @@ define float @test_fmls_ss2S_1(float %a, float %b, <2 x float> %v) {
340334
define float @test_fmls_ss4S_3_ext0(float %a, <4 x float> %v) {
341335
; CHECK-LABEL: test_fmls_ss4S_3_ext0:
342336
; CHECK: // %bb.0:
343-
; CHECK-NEXT: mov s2, v1.s[3]
344-
; CHECK-NEXT: fmsub s0, s1, s2, s0
337+
; CHECK-NEXT: fmls s0, s1, v1.s[3]
345338
; CHECK-NEXT: ret
346339
%tmp0 = extractelement <4 x float> %v, i32 0
347340
%tmp1 = extractelement <4 x float> %v, i32 3
@@ -437,8 +430,7 @@ define double @test_fmls_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
437430
define double @test_fmls_dd2D_1_ext0(double %a, <2 x double> %v) {
438431
; CHECK-LABEL: test_fmls_dd2D_1_ext0:
439432
; CHECK: // %bb.0:
440-
; CHECK-NEXT: mov d2, v1.d[1]
441-
; CHECK-NEXT: fmsub d0, d1, d2, d0
433+
; CHECK-NEXT: fmls d0, d1, v1.d[1]
442434
; CHECK-NEXT: ret
443435
%tmp0 = extractelement <2 x double> %v, i32 0
444436
%tmp1 = extractelement <2 x double> %v, i32 1

0 commit comments

Comments
 (0)