@@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
18
18
ret <4 x float > %r
19
19
}
20
20
21
; Mixed-width variant of the lane-0 case: the element is extracted from a
; <2 x float> source, negated, and inserted into lane 0 of a <4 x float>.
; The CHECK lines expect the scalar extract/fneg/insert sequence unchanged.
+ define <4 x float > @ext0_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
22
+ ; CHECK-LABEL: @ext0_v2f32v4f32(
23
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
24
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
25
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0
26
+ ; CHECK-NEXT: ret <4 x float> [[R]]
27
+ ;
28
+ %e = extractelement <2 x float > %x , i32 0
29
+ %n = fneg float %e
30
+ %r = insertelement <4 x float > %y , float %n , i32 0
31
+ ret <4 x float > %r
32
+ }
33
+
21
34
; Eliminating extract/insert is profitable.
22
35
23
36
define <4 x float > @ext2_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
32
45
ret <4 x float > %r
33
46
}
34
47
48
; Mixed-width variant using index 2 for both the extract (<2 x float> source)
; and the insert (<4 x float> destination). The CHECK lines expect the scalar
; sequence unchanged.
; NOTE(review): index 2 is past the last lane of a <2 x float>, so the extract
; presumably yields poison per the LangRef - confirm that an out-of-range
; source index is what this test is meant to cover.
+ define <4 x float > @ext2_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
49
+ ; CHECK-LABEL: @ext2_v2f32v4f32(
50
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 2
51
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
52
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2
53
+ ; CHECK-NEXT: ret <4 x float> [[R]]
54
+ ;
55
+ %e = extractelement <2 x float > %x , i32 2
56
+ %n = fneg float %e
57
+ %r = insertelement <4 x float > %y , float %n , i32 2
58
+ ret <4 x float > %r
59
+ }
60
+
35
61
; Eliminating extract/insert is still profitable. Flags propagate.
36
62
37
63
define <2 x double > @ext1_v2f64 (<2 x double > %x , <2 x double > %y ) {
@@ -46,6 +72,19 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
46
72
ret <2 x double > %r
47
73
}
48
74
75
; Mixed-width variant with a fast-math flag: lane 1 of a <2 x double> is
; negated with `fneg nsz` and inserted into lane 1 of a <4 x double>.
; The CHECK lines expect the scalar sequence unchanged, with nsz still on
; the scalar fneg.
+ define <4 x double > @ext1_v2f64v4f64 (<2 x double > %x , <4 x double > %y ) {
76
+ ; CHECK-LABEL: @ext1_v2f64v4f64(
77
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
78
+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
79
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
80
+ ; CHECK-NEXT: ret <4 x double> [[R]]
81
+ ;
82
+ %e = extractelement <2 x double > %x , i32 1
83
+ %n = fneg nsz double %e
84
+ %r = insertelement <4 x double > %y , double %n , i32 1
85
+ ret <4 x double > %r
86
+ }
87
+
49
88
; The vector fneg would cost twice as much as the scalar op with SSE,
50
89
; so we don't transform there (the shuffle would also be more expensive).
51
90
@@ -67,6 +106,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
67
106
ret <8 x float > %r
68
107
}
69
108
109
; Mixed-width variant: lane 3 (the last lane) of a <4 x float> is negated and
; inserted into lane 7 (the last lane) of an <8 x float>. Note that the
; extract and insert indices differ here. The CHECK lines expect the scalar
; sequence unchanged.
+ define <8 x float > @ext7_v4f32v8f32 (<4 x float > %x , <8 x float > %y ) {
110
+ ; CHECK-LABEL: @ext7_v4f32v8f32(
111
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
112
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
113
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
114
+ ; CHECK-NEXT: ret <8 x float> [[R]]
115
+ ;
116
+ %e = extractelement <4 x float > %x , i32 3
117
+ %n = fneg float %e
118
+ %r = insertelement <8 x float > %y , float %n , i32 7
119
+ ret <8 x float > %r
120
+ }
121
+
70
122
; Same as above with an extra use of the extracted element.
71
123
72
124
define <8 x float > @ext7_v8f32_use1 (<8 x float > %x , <8 x float > %y ) {
@@ -91,6 +143,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
91
143
ret <8 x float > %r
92
144
}
93
145
146
; Mixed-width variant where the extracted element has a second use: it is
; passed to @use before being negated. Despite the "ext7" in the name, both
; the extract (from <4 x float>) and the insert (into <8 x float>) use lane 3.
; The CHECK lines expect the scalar sequence, including the call, unchanged.
+ define <8 x float > @ext7_v4f32v8f32_use1 (<4 x float > %x , <8 x float > %y ) {
147
+ ; CHECK-LABEL: @ext7_v4f32v8f32_use1(
148
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
149
+ ; CHECK-NEXT: call void @use(float [[E]])
150
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
151
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
152
+ ; CHECK-NEXT: ret <8 x float> [[R]]
153
+ ;
154
+ %e = extractelement <4 x float > %x , i32 3
155
+ call void @use (float %e )
156
+ %n = fneg float %e
157
+ %r = insertelement <8 x float > %y , float %n , i32 3
158
+ ret <8 x float > %r
159
+ }
160
+
94
161
; Negative test - the transform is likely not profitable if the fneg has another use.
95
162
96
163
define <8 x float > @ext7_v8f32_use2 (<8 x float > %x , <8 x float > %y ) {
@@ -108,6 +175,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
108
175
ret <8 x float > %r
109
176
}
110
177
178
; Mixed-width variant where the negated scalar has a second use: %n is passed
; to @use in addition to feeding the insert. Both the extract (from
; <4 x float>) and the insert (into <8 x float>) use lane 3.
; The CHECK lines expect the scalar sequence, including the call, unchanged.
+ define <8 x float > @ext7_v4f32v8f32_use2 (<4 x float > %x , <8 x float > %y ) {
179
+ ; CHECK-LABEL: @ext7_v4f32v8f32_use2(
180
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
181
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
182
+ ; CHECK-NEXT: call void @use(float [[N]])
183
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
184
+ ; CHECK-NEXT: ret <8 x float> [[R]]
185
+ ;
186
+ %e = extractelement <4 x float > %x , i32 3
187
+ %n = fneg float %e
188
+ call void @use (float %n )
189
+ %r = insertelement <8 x float > %y , float %n , i32 3
190
+ ret <8 x float > %r
191
+ }
192
+
111
193
; Negative test - can't convert variable index to a shuffle.
112
194
113
195
define <2 x double > @ext_index_var_v2f64 (<2 x double > %x , <2 x double > %y , i32 %index ) {
@@ -123,6 +205,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
123
205
ret <2 x double > %r
124
206
}
125
207
208
; Mixed-width variant with a non-constant lane: the same i32 %index selects
; the element extracted from the <2 x double> and the lane written in the
; <4 x double>. The CHECK lines expect the scalar sequence unchanged.
+ define <4 x double > @ext_index_var_v2f64v4f64 (<2 x double > %x , <4 x double > %y , i32 %index ) {
209
+ ; CHECK-LABEL: @ext_index_var_v2f64v4f64(
210
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
211
+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
212
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
213
+ ; CHECK-NEXT: ret <4 x double> [[R]]
214
+ ;
215
+ %e = extractelement <2 x double > %x , i32 %index
216
+ %n = fneg nsz double %e
217
+ %r = insertelement <4 x double > %y , double %n , i32 %index
218
+ ret <4 x double > %r
219
+ }
220
+
126
221
; Negative test - require same extract/insert index for simple shuffle.
127
222
; TODO: We could handle this by adjusting the cost calculation.
128
223
@@ -139,6 +234,19 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
139
234
ret <2 x double > %r
140
235
}
141
236
237
; Mixed-width variant with mismatched lanes: the element comes from lane 1 of
; a <2 x double> but is inserted into lane 0 of a <4 x double>.
; The CHECK lines expect the scalar sequence unchanged.
+ define <4 x double > @ext1_v2f64v4f64_ins0 (<2 x double > %x , <4 x double > %y ) {
238
+ ; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
239
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
240
+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
241
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
242
+ ; CHECK-NEXT: ret <4 x double> [[R]]
243
+ ;
244
+ %e = extractelement <2 x double > %x , i32 1
245
+ %n = fneg nsz double %e
246
+ %r = insertelement <4 x double > %y , double %n , i32 0
247
+ ret <4 x double > %r
248
+ }
249
+
142
250
; Negative test - avoid changing poison ops
143
251
144
252
define <4 x float > @ext12_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -154,6 +262,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
154
262
ret <4 x float > %r
155
263
}
156
264
265
; Mixed-width variant with out-of-range constant indices: extract index 6
; exceeds the 2 lanes of the source and insert index 12 exceeds the 4 lanes of
; the destination. The CHECK lines expect the scalar sequence unchanged.
+ define <4 x float > @ext12_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
266
+ ; CHECK-LABEL: @ext12_v2f32v4f32(
267
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6
268
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
269
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
270
+ ; CHECK-NEXT: ret <4 x float> [[R]]
271
+ ;
272
+ %e = extractelement <2 x float > %x , i32 6
273
+ %n = fneg float %e
274
+ %r = insertelement <4 x float > %y , float %n , i32 12
275
+ ret <4 x float > %r
276
+ }
277
+
157
278
; This used to crash because we assumed matching a true, unary fneg instruction.
158
279
159
280
define <2 x float > @ext1_v2f32_fsub (<2 x float > %x ) {
@@ -181,3 +302,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
181
302
%r = insertelement <2 x float > %y , float %s , i32 1
182
303
ret <2 x float > %r
183
304
}
305
+
306
; Mixed-width variant where the negation is written as `fsub 0.0, %e` with
; nsz/nnan flags instead of a unary fneg: lane 1 of a <2 x float> is negated
; and inserted into lane 1 of a <4 x float>. The CHECK lines expect the same
; extract/fsub/insert sequence, with the flags printed as `nnan nsz`.
+ define <4 x float > @ext1_v2f32v4f32_fsub_fmf (<2 x float > %x , <4 x float > %y ) {
307
+ ; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
308
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
309
+ ; CHECK-NEXT: [[S:%.*]] = fsub nnan nsz float 0.000000e+00, [[E]]
310
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[S]], i32 1
311
+ ; CHECK-NEXT: ret <4 x float> [[R]]
312
+ ;
313
+ %e = extractelement <2 x float > %x , i32 1
314
+ %s = fsub nsz nnan float 0.0 , %e
315
+ %r = insertelement <4 x float > %y , float %s , i32 1
316
+ ret <4 x float > %r
317
+ }
0 commit comments