@@ -179,22 +179,53 @@ define <8 x i16> @combine_pmaddubsw_zero_commute(<16 x i8> %a0, <16 x i8> %a1) {
179
179
ret <8 x i16 > %1
180
180
}
181
181
182
- define <16 x i16 > @combine_pmaddubsw_concat (<16 x i8 > %a0 , <16 x i8 > %a1 , < 16 x i8 > %a2 , < 16 x i8 > %a3 ) {
182
; Test: both 256-bit inputs are split into 128-bit halves, the 128-bit
; pmaddubsw intrinsic is applied to each pair of halves, and the two
; <8 x i16> results are concatenated into the <16 x i16> return value.
; The AVX2 checks expect a single 256-bit vpmaddubsw; the SSE and AVX1
; checks expect two 128-bit (v)pmaddubsw instructions.
+ define <16 x i16 > @combine_pmaddubsw_concat (<32 x i8 > %a0 , <32 x i8 > %a1 ) {
183
183
; SSE-LABEL: combine_pmaddubsw_concat:
184
184
; SSE: # %bb.0:
185
+ ; SSE-NEXT: pmaddubsw %xmm2, %xmm0
186
+ ; SSE-NEXT: pmaddubsw %xmm3, %xmm1
187
+ ; SSE-NEXT: retq
188
+ ;
189
+ ; AVX1-LABEL: combine_pmaddubsw_concat:
190
+ ; AVX1: # %bb.0:
191
+ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
192
+ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
193
+ ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm2
194
+ ; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
195
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
196
+ ; AVX1-NEXT: retq
197
+ ;
198
+ ; AVX2-LABEL: combine_pmaddubsw_concat:
199
+ ; AVX2: # %bb.0:
200
+ ; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
201
+ ; AVX2-NEXT: retq
; Extract elements 0-15 (low half) and 16-31 (high half) of each input.
202
+ %lo0 = shufflevector <32 x i8 > %a0 , <32 x i8 > undef , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 12 , i32 13 , i32 14 , i32 15 >
203
+ %lo1 = shufflevector <32 x i8 > %a1 , <32 x i8 > undef , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 12 , i32 13 , i32 14 , i32 15 >
204
+ %hi0 = shufflevector <32 x i8 > %a0 , <32 x i8 > undef , <16 x i32 > <i32 16 , i32 17 , i32 18 , i32 19 , i32 20 , i32 21 , i32 22 , i32 23 , i32 24 , i32 25 , i32 26 , i32 27 , i32 28 , i32 29 , i32 30 , i32 31 >
205
+ %hi1 = shufflevector <32 x i8 > %a1 , <32 x i8 > undef , <16 x i32 > <i32 16 , i32 17 , i32 18 , i32 19 , i32 20 , i32 21 , i32 22 , i32 23 , i32 24 , i32 25 , i32 26 , i32 27 , i32 28 , i32 29 , i32 30 , i32 31 >
; Apply the 128-bit intrinsic to the low pair and to the high pair.
206
+ %lo = call <8 x i16 > @llvm.x86.ssse3.pmadd.ub.sw.128 (<16 x i8 > %lo0 , <16 x i8 > %lo1 )
207
+ %hi = call <8 x i16 > @llvm.x86.ssse3.pmadd.ub.sw.128 (<16 x i8 > %hi0 , <16 x i8 > %hi1 )
; Indices 0-15 over (%lo, %hi) select all of %lo followed by all of %hi.
208
+ %res = shufflevector <8 x i16 > %lo , <8 x i16 > %hi , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 12 , i32 13 , i32 14 , i32 15 >
209
+ ret <16 x i16 > %res
210
+ }
211
+
212
+ ; TODO: Not beneficial to concatenate both inputs just to create a 256-bit pmaddubsw
213
+ define <16 x i16 > @combine_pmaddubsw_concat_unecessary (<16 x i8 > %a0 , <16 x i8 > %a1 , <16 x i8 > %a2 , <16 x i8 > %a3 ) {
214
+ ; SSE-LABEL: combine_pmaddubsw_concat_unecessary:
215
+ ; SSE: # %bb.0:
185
216
; SSE-NEXT: pmaddubsw %xmm1, %xmm0
186
217
; SSE-NEXT: pmaddubsw %xmm3, %xmm2
187
218
; SSE-NEXT: movdqa %xmm2, %xmm1
188
219
; SSE-NEXT: retq
189
220
;
190
- ; AVX1-LABEL: combine_pmaddubsw_concat :
221
+ ; AVX1-LABEL: combine_pmaddubsw_concat_unecessary :
191
222
; AVX1: # %bb.0:
192
223
; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
193
224
; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm1
194
225
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
195
226
; AVX1-NEXT: retq
196
227
;
197
- ; AVX2-LABEL: combine_pmaddubsw_concat :
228
+ ; AVX2-LABEL: combine_pmaddubsw_concat_unecessary :
198
229
; AVX2: # %bb.0:
199
230
; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
200
231
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
0 commit comments