Skip to content

Commit fc8a77e

Browse files
[LLVM][CodeGen][SVE] Prefer NEON instructions when zeroing Z registers.
Several implementations have zero-latency instructions to zero registers. To date, no implementation has a dedicated SVE instruction for this, but we can use the NEON equivalent because it is defined to zero bits 128..VL regardless of the immediate used. NOTE: The relevant instruction is not available in streaming mode, where the original SVE DUP instruction remains in use.
1 parent 6892d54 commit fc8a77e

File tree

56 files changed

+2519
-1377
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

56 files changed

+2519
-1377
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7714,6 +7714,7 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
77147714
"movi", ".2d",
77157715
[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
77167716

7717+
let Predicates = [HasNEON] in {
77177718
def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
77187719
def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
77197720
def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
@@ -7723,6 +7724,23 @@ def : Pat<(v4f32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
77237724
def : Pat<(v8f16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
77247725
def : Pat<(v8bf16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
77257726

7727+
// Prefer NEON instructions when zeroing ZPRs because they are potentially zero-latency.
7728+
let AddedComplexity = 5 in {
7729+
def : Pat<(nxv2i64 (splat_vector (i64 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7730+
def : Pat<(nxv4i32 (splat_vector (i32 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7731+
def : Pat<(nxv8i16 (splat_vector (i32 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7732+
def : Pat<(nxv16i8 (splat_vector (i32 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7733+
def : Pat<(nxv2f64 (splat_vector (f64 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7734+
def : Pat<(nxv2f32 (splat_vector (f32 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7735+
def : Pat<(nxv4f32 (splat_vector (f32 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7736+
def : Pat<(nxv2f16 (splat_vector (f16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7737+
def : Pat<(nxv4f16 (splat_vector (f16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7738+
def : Pat<(nxv8f16 (splat_vector (f16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7739+
def : Pat<(nxv2bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7740+
def : Pat<(nxv4bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7741+
def : Pat<(nxv8bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
7742+
}
7743+
77267744
def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
77277745
def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
77287746
def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
@@ -7743,6 +7761,7 @@ def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
77437761
def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
77447762
def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
77457763
def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
7764+
}
77467765

77477766
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
77487767
let isReMaterializable = 1, isAsCheapAsAMove = 1 in

llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,21 @@ entry:
5050
define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
5151
; CHECK-LABEL: mul_add_mull:
5252
; CHECK: // %bb.0: // %entry
53-
; CHECK-NEXT: mov z24.d, #0 // =0x0
53+
; CHECK-NEXT: movi v24.2d, #0000000000000000
54+
; CHECK-NEXT: movi v25.2d, #0000000000000000
55+
; CHECK-NEXT: movi v26.2d, #0000000000000000
56+
; CHECK-NEXT: movi v27.2d, #0000000000000000
5457
; CHECK-NEXT: ptrue p0.d
55-
; CHECK-NEXT: mov z25.d, z24.d
56-
; CHECK-NEXT: mov z26.d, z24.d
57-
; CHECK-NEXT: mov z27.d, z24.d
58-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0
59-
; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0
60-
; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0
58+
; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #0
59+
; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #0
6160
; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0
62-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90
63-
; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90
64-
; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90
61+
; CHECK-NEXT: fcmla z26.d, p0/m, z7.d, z5.d, #0
62+
; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #90
63+
; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #90
6564
; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90
66-
; CHECK-NEXT: fadd z1.d, z26.d, z24.d
67-
; CHECK-NEXT: fadd z0.d, z25.d, z27.d
65+
; CHECK-NEXT: fcmla z26.d, p0/m, z7.d, z5.d, #90
66+
; CHECK-NEXT: fadd z0.d, z24.d, z27.d
67+
; CHECK-NEXT: fadd z1.d, z25.d, z26.d
6868
; CHECK-NEXT: ret
6969
entry:
7070
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -101,21 +101,21 @@ entry:
101101
define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
102102
; CHECK-LABEL: mul_sub_mull:
103103
; CHECK: // %bb.0: // %entry
104-
; CHECK-NEXT: mov z24.d, #0 // =0x0
104+
; CHECK-NEXT: movi v24.2d, #0000000000000000
105+
; CHECK-NEXT: movi v25.2d, #0000000000000000
106+
; CHECK-NEXT: movi v26.2d, #0000000000000000
107+
; CHECK-NEXT: movi v27.2d, #0000000000000000
105108
; CHECK-NEXT: ptrue p0.d
106-
; CHECK-NEXT: mov z25.d, z24.d
107-
; CHECK-NEXT: mov z26.d, z24.d
108-
; CHECK-NEXT: mov z27.d, z24.d
109-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0
110-
; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0
111-
; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0
109+
; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #0
110+
; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #0
112111
; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0
113-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90
114-
; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90
115-
; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90
112+
; CHECK-NEXT: fcmla z26.d, p0/m, z7.d, z5.d, #0
113+
; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #90
114+
; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #90
116115
; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90
117-
; CHECK-NEXT: fsub z1.d, z26.d, z24.d
118-
; CHECK-NEXT: fsub z0.d, z25.d, z27.d
116+
; CHECK-NEXT: fcmla z26.d, p0/m, z7.d, z5.d, #90
117+
; CHECK-NEXT: fsub z0.d, z24.d, z27.d
118+
; CHECK-NEXT: fsub z1.d, z25.d, z26.d
119119
; CHECK-NEXT: ret
120120
entry:
121121
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -152,21 +152,21 @@ entry:
152152
define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
153153
; CHECK-LABEL: mul_conj_mull:
154154
; CHECK: // %bb.0: // %entry
155-
; CHECK-NEXT: mov z24.d, #0 // =0x0
155+
; CHECK-NEXT: movi v24.2d, #0000000000000000
156+
; CHECK-NEXT: movi v25.2d, #0000000000000000
157+
; CHECK-NEXT: movi v26.2d, #0000000000000000
158+
; CHECK-NEXT: movi v27.2d, #0000000000000000
156159
; CHECK-NEXT: ptrue p0.d
157-
; CHECK-NEXT: mov z25.d, z24.d
158-
; CHECK-NEXT: mov z26.d, z24.d
159-
; CHECK-NEXT: mov z27.d, z24.d
160-
; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0
161-
; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0
162-
; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0
160+
; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #0
161+
; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #0
163162
; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #0
164-
; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270
165-
; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90
166-
; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90
163+
; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z7.d, #0
164+
; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #90
165+
; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #90
167166
; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #270
168-
; CHECK-NEXT: fadd z1.d, z26.d, z24.d
169-
; CHECK-NEXT: fadd z0.d, z25.d, z27.d
167+
; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z7.d, #270
168+
; CHECK-NEXT: fadd z0.d, z24.d, z27.d
169+
; CHECK-NEXT: fadd z1.d, z25.d, z26.d
170170
; CHECK-NEXT: ret
171171
entry:
172172
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -204,7 +204,7 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
204204
; CHECK-LABEL: mul_add_rot_mull:
205205
; CHECK: // %bb.0: // %entry
206206
; CHECK-NEXT: uzp2 z24.d, z4.d, z5.d
207-
; CHECK-NEXT: mov z25.d, #0 // =0x0
207+
; CHECK-NEXT: movi v25.2d, #0000000000000000
208208
; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
209209
; CHECK-NEXT: ptrue p0.d
210210
; CHECK-NEXT: mov z26.d, z24.d

llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,19 @@ entry:
4141
define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
4242
; CHECK-LABEL: mul_add_mull:
4343
; CHECK: // %bb.0: // %entry
44-
; CHECK-NEXT: mov z24.d, #0 // =0x0
44+
; CHECK-NEXT: movi v24.2d, #0000000000000000
45+
; CHECK-NEXT: movi v25.2d, #0000000000000000
4546
; CHECK-NEXT: ptrue p0.d
46-
; CHECK-NEXT: mov z25.d, z24.d
47-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0
4847
; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0
49-
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
48+
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0
5049
; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0
51-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90
50+
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
5251
; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90
53-
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
52+
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90
5453
; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90
55-
; CHECK-NEXT: mov z1.d, z24.d
54+
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
5655
; CHECK-NEXT: mov z0.d, z25.d
56+
; CHECK-NEXT: mov z1.d, z24.d
5757
; CHECK-NEXT: ret
5858
entry:
5959
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -90,19 +90,19 @@ entry:
9090
define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
9191
; CHECK-LABEL: mul_sub_mull:
9292
; CHECK: // %bb.0: // %entry
93-
; CHECK-NEXT: mov z24.d, #0 // =0x0
93+
; CHECK-NEXT: movi v24.2d, #0000000000000000
94+
; CHECK-NEXT: movi v25.2d, #0000000000000000
9495
; CHECK-NEXT: ptrue p0.d
95-
; CHECK-NEXT: mov z25.d, z24.d
96-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270
9796
; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270
98-
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
97+
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270
9998
; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0
100-
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180
99+
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
101100
; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180
102-
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
101+
; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180
103102
; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90
104-
; CHECK-NEXT: mov z1.d, z24.d
103+
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
105104
; CHECK-NEXT: mov z0.d, z25.d
105+
; CHECK-NEXT: mov z1.d, z24.d
106106
; CHECK-NEXT: ret
107107
entry:
108108
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -139,19 +139,19 @@ entry:
139139
define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
140140
; CHECK-LABEL: mul_conj_mull:
141141
; CHECK: // %bb.0: // %entry
142-
; CHECK-NEXT: mov z24.d, #0 // =0x0
142+
; CHECK-NEXT: movi v24.2d, #0000000000000000
143+
; CHECK-NEXT: movi v25.2d, #0000000000000000
143144
; CHECK-NEXT: ptrue p0.d
144-
; CHECK-NEXT: mov z25.d, z24.d
145-
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
146145
; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0
147-
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
146+
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
148147
; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90
149-
; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0
148+
; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
150149
; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0
151-
; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270
150+
; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0
152151
; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270
153-
; CHECK-NEXT: mov z1.d, z24.d
152+
; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270
154153
; CHECK-NEXT: mov z0.d, z25.d
154+
; CHECK-NEXT: mov z1.d, z24.d
155155
; CHECK-NEXT: ret
156156
entry:
157157
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ entry:
4646
define <vscale x 8 x half> @complex_mul_v8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
4747
; CHECK-LABEL: complex_mul_v8f16:
4848
; CHECK: // %bb.0: // %entry
49-
; CHECK-NEXT: mov z2.h, #0 // =0x0
49+
; CHECK-NEXT: movi v2.2d, #0000000000000000
5050
; CHECK-NEXT: ptrue p0.h
5151
; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #0
5252
; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #90
@@ -72,15 +72,15 @@ entry:
7272
define <vscale x 16 x half> @complex_mul_v16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b) {
7373
; CHECK-LABEL: complex_mul_v16f16:
7474
; CHECK: // %bb.0: // %entry
75-
; CHECK-NEXT: mov z4.h, #0 // =0x0
75+
; CHECK-NEXT: movi v4.2d, #0000000000000000
76+
; CHECK-NEXT: movi v5.2d, #0000000000000000
7677
; CHECK-NEXT: ptrue p0.h
77-
; CHECK-NEXT: mov z5.d, z4.d
78-
; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0
7978
; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0
80-
; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #90
79+
; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0
8180
; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90
82-
; CHECK-NEXT: mov z1.d, z4.d
81+
; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #90
8382
; CHECK-NEXT: mov z0.d, z5.d
83+
; CHECK-NEXT: mov z1.d, z4.d
8484
; CHECK-NEXT: ret
8585
entry:
8686
%a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
@@ -103,23 +103,23 @@ entry:
103103
define <vscale x 32 x half> @complex_mul_v32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
104104
; CHECK-LABEL: complex_mul_v32f16:
105105
; CHECK: // %bb.0: // %entry
106-
; CHECK-NEXT: mov z24.h, #0 // =0x0
106+
; CHECK-NEXT: movi v24.2d, #0000000000000000
107+
; CHECK-NEXT: movi v25.2d, #0000000000000000
108+
; CHECK-NEXT: movi v26.2d, #0000000000000000
109+
; CHECK-NEXT: movi v27.2d, #0000000000000000
107110
; CHECK-NEXT: ptrue p0.h
108-
; CHECK-NEXT: mov z25.d, z24.d
109-
; CHECK-NEXT: mov z26.d, z24.d
110-
; CHECK-NEXT: mov z27.d, z24.d
111-
; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0
112-
; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #0
113-
; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #0
111+
; CHECK-NEXT: fcmla z24.h, p0/m, z4.h, z0.h, #0
112+
; CHECK-NEXT: fcmla z25.h, p0/m, z5.h, z1.h, #0
114113
; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #0
115-
; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90
116-
; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #90
117-
; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #90
114+
; CHECK-NEXT: fcmla z26.h, p0/m, z7.h, z3.h, #0
115+
; CHECK-NEXT: fcmla z24.h, p0/m, z4.h, z0.h, #90
116+
; CHECK-NEXT: fcmla z25.h, p0/m, z5.h, z1.h, #90
118117
; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #90
119-
; CHECK-NEXT: mov z3.d, z24.d
120-
; CHECK-NEXT: mov z0.d, z25.d
121-
; CHECK-NEXT: mov z1.d, z26.d
118+
; CHECK-NEXT: fcmla z26.h, p0/m, z7.h, z3.h, #90
119+
; CHECK-NEXT: mov z0.d, z24.d
120+
; CHECK-NEXT: mov z1.d, z25.d
122121
; CHECK-NEXT: mov z2.d, z27.d
122+
; CHECK-NEXT: mov z3.d, z26.d
123123
; CHECK-NEXT: ret
124124
entry:
125125
%a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)

llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ target triple = "aarch64"
77
define <vscale x 4 x float> @complex_mul_v4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
88
; CHECK-LABEL: complex_mul_v4f32:
99
; CHECK: // %bb.0: // %entry
10-
; CHECK-NEXT: mov z2.s, #0 // =0x0
10+
; CHECK-NEXT: movi v2.2d, #0000000000000000
1111
; CHECK-NEXT: ptrue p0.s
1212
; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #0
1313
; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #90
@@ -34,15 +34,15 @@ entry:
3434
define <vscale x 8 x float> @complex_mul_v8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b) {
3535
; CHECK-LABEL: complex_mul_v8f32:
3636
; CHECK: // %bb.0: // %entry
37-
; CHECK-NEXT: mov z4.s, #0 // =0x0
37+
; CHECK-NEXT: movi v4.2d, #0000000000000000
38+
; CHECK-NEXT: movi v5.2d, #0000000000000000
3839
; CHECK-NEXT: ptrue p0.s
39-
; CHECK-NEXT: mov z5.d, z4.d
40-
; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0
4140
; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0
42-
; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #90
41+
; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0
4342
; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90
44-
; CHECK-NEXT: mov z1.d, z4.d
43+
; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #90
4544
; CHECK-NEXT: mov z0.d, z5.d
45+
; CHECK-NEXT: mov z1.d, z4.d
4646
; CHECK-NEXT: ret
4747
entry:
4848
%a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
@@ -65,23 +65,23 @@ entry:
6565
define <vscale x 16 x float> @complex_mul_v16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
6666
; CHECK-LABEL: complex_mul_v16f32:
6767
; CHECK: // %bb.0: // %entry
68-
; CHECK-NEXT: mov z24.s, #0 // =0x0
68+
; CHECK-NEXT: movi v24.2d, #0000000000000000
69+
; CHECK-NEXT: movi v25.2d, #0000000000000000
70+
; CHECK-NEXT: movi v26.2d, #0000000000000000
71+
; CHECK-NEXT: movi v27.2d, #0000000000000000
6972
; CHECK-NEXT: ptrue p0.s
70-
; CHECK-NEXT: mov z25.d, z24.d
71-
; CHECK-NEXT: mov z26.d, z24.d
72-
; CHECK-NEXT: mov z27.d, z24.d
73-
; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0
74-
; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #0
75-
; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #0
73+
; CHECK-NEXT: fcmla z24.s, p0/m, z4.s, z0.s, #0
74+
; CHECK-NEXT: fcmla z25.s, p0/m, z5.s, z1.s, #0
7675
; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #0
77-
; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90
78-
; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #90
79-
; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #90
76+
; CHECK-NEXT: fcmla z26.s, p0/m, z7.s, z3.s, #0
77+
; CHECK-NEXT: fcmla z24.s, p0/m, z4.s, z0.s, #90
78+
; CHECK-NEXT: fcmla z25.s, p0/m, z5.s, z1.s, #90
8079
; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #90
81-
; CHECK-NEXT: mov z3.d, z24.d
82-
; CHECK-NEXT: mov z0.d, z25.d
83-
; CHECK-NEXT: mov z1.d, z26.d
80+
; CHECK-NEXT: fcmla z26.s, p0/m, z7.s, z3.s, #90
81+
; CHECK-NEXT: mov z0.d, z24.d
82+
; CHECK-NEXT: mov z1.d, z25.d
8483
; CHECK-NEXT: mov z2.d, z27.d
84+
; CHECK-NEXT: mov z3.d, z26.d
8585
; CHECK-NEXT: ret
8686
entry:
8787
%a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)

0 commit comments

Comments
 (0)