; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
- ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK
+ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK

; Tests consecutive stores of @llvm.aarch64.sve.faddv. Within SDAG faddv is
; lowered as a FADDV + EXTRACT_VECTOR_ELT (of lane 0). Stores of extracts can
; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
; some cases as it can lead to worse codegen.
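;
; For illustration, a minimal sketch of the store-of-extract pattern described
; above, using hypothetical values %a and %b, a predicate %pg, and a destination
; pointer %p (these names are not part of this test):
;
;   %r0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a)
;   %r1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %b)
;   store float %r0, ptr %p, align 4
;   %p1 = getelementptr inbounds i8, ptr %p, i64 4
;   store float %r1, ptr %p1, align 4
;
; After lowering, each stored value is an EXTRACT_VECTOR_ELT (lane 0) of a FADDV
; node, so the two adjacent 4-byte stores become candidates for
; DAGCombiner::mergeConsecutiveStores().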

- define void @consecutive_stores_pair(ptr noalias %dest0, ptr noalias %src0) {
+ ; TODO: A single `stp s0, s1, [x0]` may be preferred here.
+ define void @consecutive_stores_pair(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
; CHECK-LABEL: consecutive_stores_pair:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
- ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1]
- ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, #1, mul vl]
; CHECK-NEXT: faddv s0, p0, z0.s
; CHECK-NEXT: faddv s1, p0, z1.s
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
- %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
- %vscale = call i64 @llvm.vscale.i64()
- %c4_vscale = shl i64 %vscale, 2
- %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
%dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
- %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
- %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
- %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
- %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+ %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec0)
+ %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec1)
store float %reduce0, ptr %dest0, align 4
store float %reduce1, ptr %dest1, align 4
ret void
}

- define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0) {
+ define void @consecutive_stores_quadruple(ptr %dest0,
; CHECK-LABEL: consecutive_stores_quadruple:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
- ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1]
- ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, #1, mul vl]
- ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, #2, mul vl]
- ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1, #3, mul vl]
; CHECK-NEXT: faddv s0, p0, z0.s
; CHECK-NEXT: faddv s1, p0, z1.s
; CHECK-NEXT: faddv s2, p0, z2.s
@@ -47,102 +36,72 @@ define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0)
; CHECK-NEXT: mov v2.s[1], v3.s[0]
; CHECK-NEXT: stp d0, d2, [x0]
; CHECK-NEXT: ret
- %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
- %vscale = call i64 @llvm.vscale.i64()
- %c4_vscale = shl i64 %vscale, 2
+ <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3)
+ {
%dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
%dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
%dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
- %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
- %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
- %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
- %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
- %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
- %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
- %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
- %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
- %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
- %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
- %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+ %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec0)
+ %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec1)
+ %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec2)
+ %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec3)
store float %reduce0, ptr %dest0, align 4
store float %reduce1, ptr %dest1, align 4
store float %reduce2, ptr %dest2, align 4
store float %reduce3, ptr %dest3, align 4
ret void
}

- define void @consecutive_stores_pair_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled" {
+ define void @consecutive_stores_pair_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: consecutive_stores_pair_streaming_function:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ptrue p0.s
- ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, #1, mul vl]
- ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
- ; CHECK-NEXT: faddv s0, p0, z0.s
; CHECK-NEXT: faddv s1, p0, z1.s
- ; CHECK-NEXT: stp s1, s0, [sp, #8]
+ ; CHECK-NEXT: faddv s0, p0, z0.s
+ ; CHECK-NEXT: stp s0, s1, [sp, #8]
; CHECK-NEXT: ldr d0, [sp, #8]
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
- %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
- %vscale = call i64 @llvm.vscale.i64()
- %c4_vscale = shl i64 %vscale, 2
- %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
%dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
- %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
- %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
- %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
- %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+ %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec0)
+ %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec1)
store float %reduce0, ptr %dest0, align 4
store float %reduce1, ptr %dest1, align 4
ret void
}

- define void @consecutive_stores_quadruple_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled" {
+ define void @consecutive_stores_quadruple_streaming_function(ptr %dest0,
; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
- ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1]
- ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, #1, mul vl]
- ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, #3, mul vl]
- ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1, #2, mul vl]
; CHECK-NEXT: faddv s0, p0, z0.s
; CHECK-NEXT: faddv s1, p0, z1.s
- ; CHECK-NEXT: faddv s2, p0, z2.s
; CHECK-NEXT: faddv s3, p0, z3.s
+ ; CHECK-NEXT: faddv s2, p0, z2.s
; CHECK-NEXT: stp s0, s1, [sp, #-16]!
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr d0, [sp]
; CHECK-NEXT: str d0, [x0]
- ; CHECK-NEXT: stp s3, s2, [sp, #8]
+ ; CHECK-NEXT: stp s2, s3, [sp, #8]
; CHECK-NEXT: ldr d0, [sp, #8]
; CHECK-NEXT: str d0, [x0, #8]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
- %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
- %vscale = call i64 @llvm.vscale.i64()
- %c4_vscale = shl i64 %vscale, 2
+ <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled"
+ {
%dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
%dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
%dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
- %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
- %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
- %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
- %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
- %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
- %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
- %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
- %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
- %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
- %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
- %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+ %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec0)
+ %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec1)
+ %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec2)
+ %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %vec3)
store float %reduce0, ptr %dest0, align 4
store float %reduce1, ptr %dest1, align 4
store float %reduce2, ptr %dest2, align 4
store float %reduce3, ptr %dest3, align 4
ret void
}
-
- attributes #0 = { vscale_range(1,16) "target-features"="+sve,+sme" }