@@ -469,3 +469,41 @@ define <8 x i16> @pmaddubsw_bad_indices(ptr %Aptr, ptr %Bptr) {
469
469
%trunc = trunc <8 x i32 > %min to <8 x i16 >
470
470
ret <8 x i16 > %trunc
471
471
}
472
+
473
+ define <8 x i16 > @pmaddubsw_large_vector (ptr %p1 , ptr %p2 ) { ; unsigned bytes from %p1 times signed bytes from %p2, adjacent products summed and clamped to i16, lanes 2/5/7 kept; checks below expect one (v)pmaddubsw plus a lane mask
474
+ ; SSE-LABEL: pmaddubsw_large_vector:
475
+ ; SSE: # %bb.0:
476
+ ; SSE-NEXT: movdqa (%rdi), %xmm0
477
+ ; SSE-NEXT: pmaddubsw (%rsi), %xmm0
478
+ ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
479
+ ; SSE-NEXT: retq
480
+ ;
481
+ ; AVX-LABEL: pmaddubsw_large_vector:
482
+ ; AVX: # %bb.0:
483
+ ; AVX-NEXT: vmovdqa (%rdi), %xmm0
484
+ ; AVX-NEXT: vpmaddubsw (%rsi), %xmm0, %xmm0
485
+ ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
486
+ ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
487
+ ; AVX-NEXT: retq
488
+ %1 = load <64 x i8 >, ptr %p1 , align 64 ; 64-byte load, but the shuffles below read only bytes 0-15. NOTE(review): the name suggests the oversized load is the point of this test - confirm against the commit
489
+ %2 = shufflevector <64 x i8 > %1 , <64 x i8 > poison, <8 x i32 > <i32 0 , i32 2 , i32 4 , i32 6 , i32 8 , i32 10 , i32 12 , i32 14 > ; even bytes 0..14 of %1
490
+ %3 = shufflevector <64 x i8 > %1 , <64 x i8 > poison, <8 x i32 > <i32 1 , i32 3 , i32 5 , i32 7 , i32 9 , i32 11 , i32 13 , i32 15 > ; odd bytes 1..15 of %1
491
+ %4 = load <32 x i8 >, ptr %p2 , align 64 ; 32-byte load, different width from %1; again only bytes 0-15 are used
492
+ %5 = shufflevector <32 x i8 > %4 , <32 x i8 > poison, <8 x i32 > <i32 0 , i32 2 , i32 4 , i32 6 , i32 8 , i32 10 , i32 12 , i32 14 > ; even bytes 0..14 of %4
493
+ %6 = shufflevector <32 x i8 > %4 , <32 x i8 > poison, <8 x i32 > <i32 1 , i32 3 , i32 5 , i32 7 , i32 9 , i32 11 , i32 13 , i32 15 > ; odd bytes 1..15 of %4
494
+ %7 = sext <8 x i8 > %5 to <8 x i32 > ; %p2 even bytes, treated as signed
495
+ %8 = zext <8 x i8 > %2 to <8 x i32 > ; %p1 even bytes, treated as unsigned
496
+ %9 = mul nsw <8 x i32 > %7 , %8 ; products of the even-byte pairs
497
+ %10 = sext <8 x i8 > %6 to <8 x i32 > ; %p2 odd bytes, treated as signed
498
+ %11 = zext <8 x i8 > %3 to <8 x i32 > ; %p1 odd bytes, treated as unsigned
499
+ %12 = mul nsw <8 x i32 > %10 , %11 ; products of the odd-byte pairs
500
+ %13 = add nsw <8 x i32 > %9 , %12 ; per lane: even product + odd product
501
+ %14 = tail call <8 x i32 > @llvm.smin.v8i32 (<8 x i32 > %13 , <8 x i32 > <i32 32767 , i32 32767 , i32 32767 , i32 32767 , i32 32767 , i32 32767 , i32 32767 , i32 32767 >) ; clamp from above at the i16 maximum
502
+ %15 = tail call <8 x i32 > @llvm.smax.v8i32 (<8 x i32 > %14 , <8 x i32 > <i32 -32768 , i32 -32768 , i32 -32768 , i32 -32768 , i32 -32768 , i32 -32768 , i32 -32768 , i32 -32768 >) ; clamp from below at the i16 minimum
503
+ %16 = trunc <8 x i32 > %15 to <8 x i16 > ; narrow the clamped sums to i16
504
+ %17 = shufflevector <8 x i16 > zeroinitializer , <8 x i16 > %16 , <8 x i32 > <i32 0 , i32 1 , i32 10 , i32 3 , i32 4 , i32 13 , i32 6 , i32 15 > ; indices 10/13/15 take lanes 2/5/7 of %16; all other lanes come from the zero vector
505
+ ret <8 x i16 > %17
506
+ }
507
+
508
+ declare <8 x i32 > @llvm.smin.v8i32 (<8 x i32 >, <8 x i32 >)
509
+ declare <8 x i32 > @llvm.smax.v8i32 (<8 x i32 >, <8 x i32 >)
0 commit comments