Skip to content

Commit ea045b9

Browse files
overmighty authored and davemgreen committed
[AArch64] Add patterns for scalar FMUL, FMULX
Scalar FMUL, FMULX instructions perform better or the same compared to indexed FMUL, FMULX.

For example, the Arm Cortex-A55 Software Optimization Guide lists the following instructions with a throughput of 2 IPC:
- "FP multiply" FMUL
- "ASIMD FP multiply" FMULX

whereas it lists the following with a throughput of 1 IPC:
- "ASIMD FP multiply, by element" FMUL, FMULX

The Arm Cortex-A510 Software Optimization Guide, however, does not separately list "by element" variants of the "ASIMD FP multiply" instructions, which are listed with the same throughput as the non-ASIMD ones.

Fixes #60817.

Differential Revision: https://reviews.llvm.org/D153207
1 parent 0446bfc commit ea045b9

9 files changed

+267
-39
lines changed

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8427,9 +8427,9 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
84278427
V128, v4f32, v8f16, OpNode>;
84288428
}
84298429

8430-
let mayRaiseFPException = 1, Uses = [FPCR] in
84318430
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
84328431
SDPatternOperator OpNode> {
8432+
let mayRaiseFPException = 1, Uses = [FPCR] in {
84338433
let Predicates = [HasNEON, HasFullFP16] in {
84348434
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
84358435
V64, V64,
@@ -8532,6 +8532,29 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
85328532
let Inst{11} = idx{0};
85338533
let Inst{21} = 0;
85348534
}
8535+
} // mayRaiseFPException = 1, Uses = [FPCR]
8536+
8537+
let Predicates = [HasNEON, HasFullFP16] in {
8538+
def : Pat<(f16 (OpNode
8539+
(f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
8540+
(f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
8541+
(!cast<Instruction>(NAME # v1i16_indexed)
8542+
(EXTRACT_SUBREG V128:$Rn, hsub), V128:$Rm, VectorIndexH:$idx)>;
8543+
}
8544+
8545+
let Predicates = [HasNEON] in {
8546+
def : Pat<(f32 (OpNode
8547+
(f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
8548+
(f32 (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx)))),
8549+
(!cast<Instruction>(NAME # v1i32_indexed)
8550+
(EXTRACT_SUBREG V128:$Rn, ssub), V128:$Rm, VectorIndexS:$idx)>;
8551+
8552+
def : Pat<(f64 (OpNode
8553+
(f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
8554+
(f64 (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx)))),
8555+
(!cast<Instruction>(NAME # v1i64_indexed)
8556+
(EXTRACT_SUBREG V128:$Rn, dsub), V128:$Rm, VectorIndexD:$idx)>;
8557+
}
85358558
}
85368559

85378560
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4443,6 +4443,33 @@ defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>;
44434443
}
44444444
defm FSUB : TwoOperandFPData<0b0011, "fsub", any_fsub>;
44454445

4446+
multiclass FMULScalarFromIndexedLane0Patterns<string inst,
4447+
string inst_f16_suffix,
4448+
string inst_f32_suffix,
4449+
string inst_f64_suffix,
4450+
SDPatternOperator OpNode,
4451+
list<Predicate> preds = []> {
4452+
let Predicates = !listconcat(preds, [HasFullFP16]) in {
4453+
def : Pat<(f16 (OpNode (f16 FPR16:$Rn),
4454+
(f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
4455+
(!cast<Instruction>(inst # inst_f16_suffix)
4456+
FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
4457+
}
4458+
let Predicates = preds in {
4459+
def : Pat<(f32 (OpNode (f32 FPR32:$Rn),
4460+
(f32 (vector_extract (v4f32 V128:$Rm), (i64 0))))),
4461+
(!cast<Instruction>(inst # inst_f32_suffix)
4462+
FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub))>;
4463+
def : Pat<(f64 (OpNode (f64 FPR64:$Rn),
4464+
(f64 (vector_extract (v2f64 V128:$Rm), (i64 0))))),
4465+
(!cast<Instruction>(inst # inst_f64_suffix)
4466+
FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub))>;
4467+
}
4468+
}
4469+
4470+
defm : FMULScalarFromIndexedLane0Patterns<"FMUL", "Hrr", "Srr", "Drr",
4471+
any_fmul>;
4472+
44464473
// Match reassociated forms of FNMUL.
44474474
def : Pat<(fmul (fneg FPR16:$a), (f16 FPR16:$b)),
44484475
(FNMULHrr FPR16:$a, FPR16:$b)>,
@@ -5248,6 +5275,10 @@ let Predicates = [HasRDM] in {
52485275
(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
52495276
}
52505277

5278+
defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
5279+
int_aarch64_neon_fmulx,
5280+
[HasNEONorSME]>;
5281+
52515282
def : InstAlias<"cmls $dst, $src1, $src2",
52525283
(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
52535284
def : InstAlias<"cmle $dst, $src1, $src2",

llvm/test/CodeGen/AArch64/arm64-fma-combines.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ entry:
1717
; CHECK-LABEL: %for.body
1818
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
1919
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
20-
; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
20+
; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
2121
for.body: ; preds = %for.body, %entry
2222
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
2323
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -59,7 +59,7 @@ entry:
5959
; CHECK-LABEL: %for.body
6060
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
6161
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
62-
; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
62+
; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
6363
for.body: ; preds = %for.body, %entry
6464
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
6565
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

llvm/test/CodeGen/AArch64/arm64-fml-combines.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ entry:
1010
; CHECK-LABEL: %for.body
1111
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
1212
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
13-
; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
13+
; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
1414
for.body: ; preds = %for.body, %entry
1515
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
1616
%indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
@@ -52,7 +52,7 @@ entry:
5252
; CHECK-LABEL: %for.body
5353
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
5454
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
55-
; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
55+
; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
5656
for.body: ; preds = %for.body, %entry
5757
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
5858
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll

Lines changed: 42 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,8 @@ declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
88

99
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
1010

11+
declare double @llvm.aarch64.neon.fmulx.f64(double, double)
12+
1113
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
1214
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
1315
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
@@ -2066,6 +2068,19 @@ entry:
20662068
ret <4 x float> %vmulx2.i
20672069
}
20682070

2071+
define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %v) {
2072+
; CHECK-LABEL: test_vmulx_lane_f64:
2073+
; CHECK: // %bb.0: // %entry
2074+
; CHECK-NEXT: fmulx d0, d0, d1
2075+
; CHECK-NEXT: ret
2076+
entry:
2077+
%vget_lane = extractelement <1 x double> %a, i64 0
2078+
%vget_lane3 = extractelement <1 x double> %v, i64 0
2079+
%vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vget_lane3)
2080+
%vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
2081+
ret <1 x double> %vset_lane
2082+
}
2083+
20692084
define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
20702085
; CHECK-LABEL: test_vmulxq_lane_f64:
20712086
; CHECK: // %bb.0: // %entry
@@ -2100,6 +2115,19 @@ entry:
21002115
ret <4 x float> %vmulx2.i
21012116
}
21022117

2118+
define <1 x double> @test_vmulx_laneq_f64(<1 x double> %a, <2 x double> %v) {
2119+
; CHECK-LABEL: test_vmulx_laneq_f64:
2120+
; CHECK: // %bb.0: // %entry
2121+
; CHECK-NEXT: fmulx d0, d0, v1.d[1]
2122+
; CHECK-NEXT: ret
2123+
entry:
2124+
%vget_lane = extractelement <1 x double> %a, i64 0
2125+
%vgetq_lane = extractelement <2 x double> %v, i64 1
2126+
%vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
2127+
%vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
2128+
ret <1 x double> %vset_lane
2129+
}
2130+
21032131
define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
21042132
; CHECK-LABEL: test_vmulxq_laneq_f64:
21052133
; CHECK: // %bb.0: // %entry
@@ -3560,7 +3588,7 @@ entry:
35603588
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
35613589
; CHECK-LABEL: test_vmul_laneq_f64_0:
35623590
; CHECK: // %bb.0: // %entry
3563-
; CHECK-NEXT: fmul d0, d0, v1.d[0]
3591+
; CHECK-NEXT: fmul d0, d0, d1
35643592
; CHECK-NEXT: ret
35653593
entry:
35663594
%0 = bitcast <1 x double> %a to <8 x i8>
@@ -3651,6 +3679,19 @@ entry:
36513679
ret <4 x float> %vmulx2.i
36523680
}
36533681

3682+
define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
3683+
; CHECK-LABEL: test_vmulx_laneq_f64_0:
3684+
; CHECK: // %bb.0: // %entry
3685+
; CHECK-NEXT: fmulx d0, d0, d1
3686+
; CHECK-NEXT: ret
3687+
entry:
3688+
%vget_lane = extractelement <1 x double> %a, i64 0
3689+
%vgetq_lane = extractelement <2 x double> %v, i64 0
3690+
%vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
3691+
%vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
3692+
ret <1 x double> %vset_lane
3693+
}
3694+
36543695
define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
36553696
; CHECK-LABEL: test_vmulxq_laneq_f64_0:
36563697
; CHECK: // %bb.0: // %entry

llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll

Lines changed: 93 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,19 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
22
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
33

4-
define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
5-
; CHECK-LABEL: test_fmul_lane_ss2S:
4+
define float @test_fmul_lane_ss2S_0(float %a, <2 x float> %v) {
5+
; CHECK-LABEL: test_fmul_lane_ss2S_0:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
8+
; CHECK-NEXT: fmul s0, s0, s1
9+
; CHECK-NEXT: ret
10+
%tmp1 = extractelement <2 x float> %v, i32 0
11+
%tmp2 = fmul float %a, %tmp1
12+
ret float %tmp2
13+
}
14+
15+
define float @test_fmul_lane_ss2S_1(float %a, <2 x float> %v) {
16+
; CHECK-LABEL: test_fmul_lane_ss2S_1:
617
; CHECK: // %bb.0:
718
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
819
; CHECK-NEXT: fmul s0, s0, v1.s[1]
@@ -12,8 +23,8 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
1223
ret float %tmp2;
1324
}
1425

15-
define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
16-
; CHECK-LABEL: test_fmul_lane_ss2S_swap:
26+
define float @test_fmul_lane_ss2S_1_swap(float %a, <2 x float> %v) {
27+
; CHECK-LABEL: test_fmul_lane_ss2S_1_swap:
1728
; CHECK: // %bb.0:
1829
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1930
; CHECK-NEXT: fmul s0, s0, v1.s[1]
@@ -23,9 +34,18 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
2334
ret float %tmp2;
2435
}
2536

37+
define float @test_fmul_lane_ss4S_0(float %a, <4 x float> %v) {
38+
; CHECK-LABEL: test_fmul_lane_ss4S_0:
39+
; CHECK: // %bb.0:
40+
; CHECK-NEXT: fmul s0, s0, s1
41+
; CHECK-NEXT: ret
42+
%tmp1 = extractelement <4 x float> %v, i32 0
43+
%tmp2 = fmul float %a, %tmp1
44+
ret float %tmp2
45+
}
2646

27-
define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
28-
; CHECK-LABEL: test_fmul_lane_ss4S:
47+
define float @test_fmul_lane_ss4S_3(float %a, <4 x float> %v) {
48+
; CHECK-LABEL: test_fmul_lane_ss4S_3:
2949
; CHECK: // %bb.0:
3050
; CHECK-NEXT: fmul s0, s0, v1.s[3]
3151
; CHECK-NEXT: ret
@@ -34,8 +54,8 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
3454
ret float %tmp2;
3555
}
3656

37-
define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
38-
; CHECK-LABEL: test_fmul_lane_ss4S_swap:
57+
define float @test_fmul_lane_ss4S_3_swap(float %a, <4 x float> %v) {
58+
; CHECK-LABEL: test_fmul_lane_ss4S_3_swap:
3959
; CHECK: // %bb.0:
4060
; CHECK-NEXT: fmul s0, s0, v1.s[3]
4161
; CHECK-NEXT: ret
@@ -56,9 +76,18 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
5676
}
5777

5878

79+
define double @test_fmul_lane_dd2D_0(double %a, <2 x double> %v) {
80+
; CHECK-LABEL: test_fmul_lane_dd2D_0:
81+
; CHECK: // %bb.0:
82+
; CHECK-NEXT: fmul d0, d0, d1
83+
; CHECK-NEXT: ret
84+
%tmp1 = extractelement <2 x double> %v, i32 0
85+
%tmp2 = fmul double %a, %tmp1
86+
ret double %tmp2
87+
}
5988

60-
define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
61-
; CHECK-LABEL: test_fmul_lane_dd2D:
89+
define double @test_fmul_lane_dd2D_1(double %a, <2 x double> %v) {
90+
; CHECK-LABEL: test_fmul_lane_dd2D_1:
6291
; CHECK: // %bb.0:
6392
; CHECK-NEXT: fmul d0, d0, v1.d[1]
6493
; CHECK-NEXT: ret
@@ -68,8 +97,8 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
6897
}
6998

7099

71-
define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
72-
; CHECK-LABEL: test_fmul_lane_dd2D_swap:
100+
define double @test_fmul_lane_dd2D_1_swap(double %a, <2 x double> %v) {
101+
; CHECK-LABEL: test_fmul_lane_dd2D_1_swap:
73102
; CHECK: // %bb.0:
74103
; CHECK-NEXT: fmul d0, d0, v1.d[1]
75104
; CHECK-NEXT: ret
@@ -80,8 +109,19 @@ define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
80109

81110
declare float @llvm.aarch64.neon.fmulx.f32(float, float)
82111

83-
define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
84-
; CHECK-LABEL: test_fmulx_lane_f32:
112+
define float @test_fmulx_lane_f32_0(float %a, <2 x float> %v) {
113+
; CHECK-LABEL: test_fmulx_lane_f32_0:
114+
; CHECK: // %bb.0:
115+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
116+
; CHECK-NEXT: fmulx s0, s0, s1
117+
; CHECK-NEXT: ret
118+
%tmp1 = extractelement <2 x float> %v, i32 0
119+
%tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
120+
ret float %tmp2;
121+
}
122+
123+
define float @test_fmulx_lane_f32_1(float %a, <2 x float> %v) {
124+
; CHECK-LABEL: test_fmulx_lane_f32_1:
85125
; CHECK: // %bb.0:
86126
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
87127
; CHECK-NEXT: fmulx s0, s0, v1.s[1]
@@ -91,8 +131,18 @@ define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
91131
ret float %tmp2;
92132
}
93133

94-
define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
95-
; CHECK-LABEL: test_fmulx_laneq_f32:
134+
define float @test_fmulx_laneq_f32_0(float %a, <4 x float> %v) {
135+
; CHECK-LABEL: test_fmulx_laneq_f32_0:
136+
; CHECK: // %bb.0:
137+
; CHECK-NEXT: fmulx s0, s0, s1
138+
; CHECK-NEXT: ret
139+
%tmp1 = extractelement <4 x float> %v, i32 0
140+
%tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
141+
ret float %tmp2;
142+
}
143+
144+
define float @test_fmulx_laneq_f32_3(float %a, <4 x float> %v) {
145+
; CHECK-LABEL: test_fmulx_laneq_f32_3:
96146
; CHECK: // %bb.0:
97147
; CHECK-NEXT: fmulx s0, s0, v1.s[3]
98148
; CHECK-NEXT: ret
@@ -101,8 +151,8 @@ define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
101151
ret float %tmp2;
102152
}
103153

104-
define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
105-
; CHECK-LABEL: test_fmulx_laneq_f32_swap:
154+
define float @test_fmulx_laneq_f32_3_swap(float %a, <4 x float> %v) {
155+
; CHECK-LABEL: test_fmulx_laneq_f32_3_swap:
106156
; CHECK: // %bb.0:
107157
; CHECK-NEXT: fmulx s0, s0, v1.s[3]
108158
; CHECK-NEXT: ret
@@ -126,7 +176,7 @@ define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
126176
define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
127177
; CHECK-LABEL: test_fmulx_laneq_f64_0:
128178
; CHECK: // %bb.0:
129-
; CHECK-NEXT: fmulx d0, d0, v1.d[0]
179+
; CHECK-NEXT: fmulx d0, d0, d1
130180
; CHECK-NEXT: ret
131181
%tmp1 = extractelement <2 x double> %v, i32 0
132182
%tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
@@ -154,3 +204,27 @@ define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
154204
ret double %tmp2;
155205
}
156206

207+
define float @test_fmulx_horizontal_f32(<2 x float> %v) {
208+
; CHECK-LABEL: test_fmulx_horizontal_f32:
209+
; CHECK: // %bb.0: // %entry
210+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
211+
; CHECK-NEXT: fmulx s0, s0, v0.s[1]
212+
; CHECK-NEXT: ret
213+
entry:
214+
%0 = extractelement <2 x float> %v, i32 0
215+
%1 = extractelement <2 x float> %v, i32 1
216+
%2 = call float @llvm.aarch64.neon.fmulx.f32(float %0, float %1)
217+
ret float %2
218+
}
219+
220+
define double @test_fmulx_horizontal_f64(<2 x double> %v) {
221+
; CHECK-LABEL: test_fmulx_horizontal_f64:
222+
; CHECK: // %bb.0: // %entry
223+
; CHECK-NEXT: fmulx d0, d0, v0.d[1]
224+
; CHECK-NEXT: ret
225+
entry:
226+
%0 = extractelement <2 x double> %v, i32 0
227+
%1 = extractelement <2 x double> %v, i32 1
228+
%2 = call double @llvm.aarch64.neon.fmulx.f64(double %0, double %1)
229+
ret double %2
230+
}

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
1111
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1212
; CHECK-NEXT: mov h3, v0.h[1]
1313
; CHECK-NEXT: mov h2, v1.h[1]
14-
; CHECK-NEXT: fmul h4, h2, v0.h[0]
14+
; CHECK-NEXT: fmul h4, h0, v1.h[1]
1515
; CHECK-NEXT: fnmul h2, h3, h2
1616
; CHECK-NEXT: fmla h4, h3, v1.h[0]
1717
; CHECK-NEXT: fmla h2, h0, v1.h[0]

0 commit comments

Comments
 (0)