@@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
18
18
ret <4 x float > %r
19
19
}
20
20
21
; Mixed-type variant: lane 0 of a <2 x float> source is negated and inserted
; into lane 0 of a <4 x float> destination.
; The assertions below expect the scalar extract/fneg/insert sequence to be
; left exactly as written.
+ define <4 x float > @ext0_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
22
+ ; CHECK-LABEL: @ext0_v2f32v4f32(
23
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
24
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
25
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0
26
+ ; CHECK-NEXT: ret <4 x float> [[R]]
27
+ ;
28
+ %e = extractelement <2 x float > %x , i32 0
29
+ %n = fneg float %e
30
+ %r = insertelement <4 x float > %y , float %n , i32 0
31
+ ret <4 x float > %r
32
+ }
33
+
21
34
; Eliminating extract/insert is profitable.
22
35
23
36
define <4 x float > @ext2_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
32
45
ret <4 x float > %r
33
46
}
34
47
48
; Mixed-type variant: extract index 2 from a <2 x float> source (the index is
; past the last lane of the 2-element source), negate, and insert into lane 2
; of a <4 x float> destination.
; The assertions expect a vector fneg of the source, a widening shuffle to
; <4 x float>, and a blend shuffle with %y that takes lane 2 from the widened
; vector.
; NOTE(review): the expected output defines [[TMP1]] (the vector fneg) but
; never uses it; the widening shuffle reads [[X]] instead, so the fneg does
; not feed the returned value. Confirm whether that shuffle should consume
; [[TMP1]].
+ define <4 x float > @ext2_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
49
+ ; CHECK-LABEL: @ext2_v2f32v4f32(
50
+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]]
51
+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison>
52
+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
53
+ ; CHECK-NEXT: ret <4 x float> [[R]]
54
+ ;
55
+ %e = extractelement <2 x float > %x , i32 2
56
+ %n = fneg float %e
57
+ %r = insertelement <4 x float > %y , float %n , i32 2
58
+ ret <4 x float > %r
59
+ }
60
+
35
61
; Eliminating extract/insert is still profitable. Flags propagate.
36
62
37
63
define <2 x double > @ext1_v2f64 (<2 x double > %x , <2 x double > %y ) {
@@ -46,6 +72,25 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
46
72
ret <2 x double > %r
47
73
}
48
74
75
; Mixed-type variant: lane 1 of a <2 x double> source is negated with the nsz
; flag and inserted into lane 1 of a <4 x double> destination.
; Under the SSE check prefix the scalar sequence is expected unchanged.
; Under the AVX check prefix the expected output is a vector fneg (nsz kept),
; a widening shuffle to <4 x double>, and a blend shuffle with %y.
; NOTE(review): in the AVX expectations [[TMP1]] (the vector fneg) is never
; used; the widening shuffle reads [[X]] instead, so the fneg does not feed
; the returned value. Confirm whether that shuffle should consume [[TMP1]].
+ define <4 x double > @ext1_v2f64v4f64 (<2 x double > %x , <4 x double > %y ) {
76
+ ; SSE-LABEL: @ext1_v2f64v4f64(
77
+ ; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
78
+ ; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]]
79
+ ; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
80
+ ; SSE-NEXT: ret <4 x double> [[R]]
81
+ ;
82
+ ; AVX-LABEL: @ext1_v2f64v4f64(
83
+ ; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
84
+ ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
85
+ ; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
86
+ ; AVX-NEXT: ret <4 x double> [[R]]
87
+ ;
88
+ %e = extractelement <2 x double > %x , i32 1
89
+ %n = fneg nsz double %e
90
+ %r = insertelement <4 x double > %y , double %n , i32 1
91
+ ret <4 x double > %r
92
+ }
93
+
49
94
; The vector fneg would cost twice as much as the scalar op with SSE,
50
95
; so we don't transform there (the shuffle would also be more expensive).
51
96
@@ -67,6 +112,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
67
112
ret <8 x float > %r
68
113
}
69
114
115
; Mixed-type variant: lane 3 of a <4 x float> source is negated and inserted
; into lane 7 of an <8 x float> destination, so the extract and insert
; indices differ.
; The assertions expect the scalar extract/fneg/insert sequence unchanged.
+ define <8 x float > @ext7_v4f32v8f32 (<4 x float > %x , <8 x float > %y ) {
116
+ ; CHECK-LABEL: @ext7_v4f32v8f32(
117
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
118
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
119
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
120
+ ; CHECK-NEXT: ret <8 x float> [[R]]
121
+ ;
122
+ %e = extractelement <4 x float > %x , i32 3
123
+ %n = fneg float %e
124
+ %r = insertelement <8 x float > %y , float %n , i32 7
125
+ ret <8 x float > %r
126
+ }
127
+
70
128
; Same as above with an extra use of the extracted element.
71
129
72
130
define <8 x float > @ext7_v8f32_use1 (<8 x float > %x , <8 x float > %y ) {
@@ -91,6 +149,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
91
149
ret <8 x float > %r
92
150
}
93
151
152
; Mixed-type variant with an extra use of the extracted element: lane 3 of a
; <4 x float> source is passed to @use, then negated and inserted into lane 3
; of an <8 x float> destination.
; The assertions expect the scalar sequence (including the call) unchanged.
+ define <8 x float > @ext7_v4f32v8f32_use1 (<4 x float > %x , <8 x float > %y ) {
153
+ ; CHECK-LABEL: @ext7_v4f32v8f32_use1(
154
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
155
+ ; CHECK-NEXT: call void @use(float [[E]])
156
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
157
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
158
+ ; CHECK-NEXT: ret <8 x float> [[R]]
159
+ ;
160
+ %e = extractelement <4 x float > %x , i32 3
161
+ call void @use (float %e )
162
+ %n = fneg float %e
163
+ %r = insertelement <8 x float > %y , float %n , i32 3
164
+ ret <8 x float > %r
165
+ }
166
+
94
167
; Negative test - the transform is likely not profitable if the fneg has another use.
95
168
96
169
define <8 x float > @ext7_v8f32_use2 (<8 x float > %x , <8 x float > %y ) {
@@ -108,6 +181,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
108
181
ret <8 x float > %r
109
182
}
110
183
184
; Mixed-type variant with an extra use of the negated value: lane 3 of a
; <4 x float> source is negated, the scalar result is passed to @use, and it
; is then inserted into lane 3 of an <8 x float> destination.
; The assertions expect the scalar sequence (including the call) unchanged.
+ define <8 x float > @ext7_v4f32v8f32_use2 (<4 x float > %x , <8 x float > %y ) {
185
+ ; CHECK-LABEL: @ext7_v4f32v8f32_use2(
186
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
187
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
188
+ ; CHECK-NEXT: call void @use(float [[N]])
189
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
190
+ ; CHECK-NEXT: ret <8 x float> [[R]]
191
+ ;
192
+ %e = extractelement <4 x float > %x , i32 3
193
+ %n = fneg float %e
194
+ call void @use (float %n )
195
+ %r = insertelement <8 x float > %y , float %n , i32 3
196
+ ret <8 x float > %r
197
+ }
198
+
111
199
; Negative test - can't convert variable index to a shuffle.
112
200
113
201
define <2 x double > @ext_index_var_v2f64 (<2 x double > %x , <2 x double > %y , i32 %index ) {
@@ -123,6 +211,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
123
211
ret <2 x double > %r
124
212
}
125
213
214
; Mixed-type variant with a variable lane index: the same runtime %index is
; used to extract from a <2 x double> source and to insert into a
; <4 x double> destination.
; The assertions expect the scalar extract/fneg/insert sequence unchanged.
+ define <4 x double > @ext_index_var_v2f64v4f64 (<2 x double > %x , <4 x double > %y , i32 %index ) {
215
+ ; CHECK-LABEL: @ext_index_var_v2f64v4f64(
216
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
217
+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
218
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
219
+ ; CHECK-NEXT: ret <4 x double> [[R]]
220
+ ;
221
+ %e = extractelement <2 x double > %x , i32 %index
222
+ %n = fneg nsz double %e
223
+ %r = insertelement <4 x double > %y , double %n , i32 %index
224
+ ret <4 x double > %r
225
+ }
226
+
126
227
; Negative test - require same extract/insert index for simple shuffle.
127
228
; TODO: We could handle this by adjusting the cost calculation.
128
229
@@ -139,6 +240,33 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
139
240
ret <2 x double > %r
140
241
}
141
242
243
+ ; Negative test - the extract index exceeds the number of elements in the destination vector
244
; Narrowing variant: lane 3 of a <4 x double> source is negated (nsz) and
; inserted into lane 1 of a <2 x double> destination; index 3 does not exist
; in the 2-element destination type.
; The assertions expect the scalar extract/fneg/insert sequence unchanged.
+ define <2 x double > @ext3_v4f64v2f64 (<4 x double > %x , <2 x double > %y ) {
245
+ ; CHECK-LABEL: @ext3_v4f64v2f64(
246
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
247
+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
248
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
249
+ ; CHECK-NEXT: ret <2 x double> [[R]]
250
+ ;
251
+ %e = extractelement <4 x double > %x , i32 3
252
+ %n = fneg nsz double %e
253
+ %r = insertelement <2 x double > %y , double %n , i32 1
254
+ ret <2 x double > %r
255
+ }
256
+
257
; Mixed-type variant with mismatched lanes: lane 1 of a <2 x double> source
; is negated (nsz) and inserted into lane 0 of a <4 x double> destination.
; The assertions expect the scalar extract/fneg/insert sequence unchanged.
+ define <4 x double > @ext1_v2f64v4f64_ins0 (<2 x double > %x , <4 x double > %y ) {
258
+ ; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
259
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
260
+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
261
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
262
+ ; CHECK-NEXT: ret <4 x double> [[R]]
263
+ ;
264
+ %e = extractelement <2 x double > %x , i32 1
265
+ %n = fneg nsz double %e
266
+ %r = insertelement <4 x double > %y , double %n , i32 0
267
+ ret <4 x double > %r
268
+ }
269
+
142
270
; Negative test - avoid changing poison ops
143
271
144
272
define <4 x float > @ext12_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -154,6 +282,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
154
282
ret <4 x float > %r
155
283
}
156
284
285
; Mixed-type variant with out-of-range constant indices: extract index 6 from
; a 2-element source and insert index 12 into a 4-element destination.
; The assertions expect the scalar extract/fneg/insert sequence unchanged.
+ define <4 x float > @ext12_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
286
+ ; CHECK-LABEL: @ext12_v2f32v4f32(
287
+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6
288
+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
289
+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
290
+ ; CHECK-NEXT: ret <4 x float> [[R]]
291
+ ;
292
+ %e = extractelement <2 x float > %x , i32 6
293
+ %n = fneg float %e
294
+ %r = insertelement <4 x float > %y , float %n , i32 12
295
+ ret <4 x float > %r
296
+ }
297
+
157
298
; This used to crash because we assumed matching a true, unary fneg instruction.
158
299
159
300
define <2 x float > @ext1_v2f32_fsub (<2 x float > %x ) {
@@ -181,3 +322,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
181
322
%r = insertelement <2 x float > %y , float %s , i32 1
182
323
ret <2 x float > %r
183
324
}
325
+
326
; Mixed-type variant where the negation is spelled as an fsub from 0.0 with
; the nsz and nnan flags instead of a unary fneg: lane 1 of a <2 x float>
; source is negated and inserted into lane 1 of a <4 x float> destination.
; The assertions expect a vector fneg carrying nnan nsz, a widening shuffle
; to <4 x float>, and a blend shuffle with %y.
; NOTE(review): the expected output defines [[TMP1]] (the vector fneg) but
; never uses it; the widening shuffle reads [[X]] instead, so the fneg does
; not feed the returned value. Confirm whether that shuffle should consume
; [[TMP1]].
+ define <4 x float > @ext1_v2f32v4f32_fsub_fmf (<2 x float > %x , <4 x float > %y ) {
327
+ ; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
328
+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]]
329
+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
330
+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
331
+ ; CHECK-NEXT: ret <4 x float> [[R]]
332
+ ;
333
+ %e = extractelement <2 x float > %x , i32 1
334
+ %s = fsub nsz nnan float 0 .0 , %e
335
+ %r = insertelement <4 x float > %y , float %s , i32 1
336
+ ret <4 x float > %r
337
+ }
0 commit comments