@@ -2508,3 +2508,50 @@ define void @D107009(ptr %input, ptr %output) {
2508
2508
store <64 x i32 > %i7 , ptr %output , align 16
2509
2509
ret void
2510
2510
}
2511
; Ensure concatenation of repeated subvector loads before vector can be split apart.
define void @split_v2i64_subvector_broadcast(ptr readonly align 8 captures(none) dereferenceable(64) %arg) {
; SSE-LABEL: split_v2i64_subvector_broadcast:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    movups 40(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movups %xmm0, (%rax)
; SSE-NEXT:    movups %xmm2, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: split_v2i64_subvector_broadcast:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX1-NEXT:    vmovupd %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: split_v2i64_subvector_broadcast:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups 40(%rdi), %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,1,1,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vmovups %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: split_v2i64_subvector_broadcast:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; XOP-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; XOP-NEXT:    vmovupd %ymm0, (%rax)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  ; Load <6 x i64> from arg+8 (fits in the dereferenceable(64) region: 8 + 48 <= 64),
  ; then interleave lanes {0,4,1,5} — i.e. repeated-subvector shuffle of two
  ; overlapping <2 x i64> subvector loads that the backend may concatenate.
  %gep = getelementptr inbounds nuw i8, ptr %arg, i64 8
  %load = load <6 x i64>, ptr %gep, align 8
  %shuffle = shufflevector <6 x i64> %load, <6 x i64> poison, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; Store target is poison: the test only cares about how the shuffle is lowered.
  store <4 x i64> %shuffle, ptr poison, align 8
  ret void
}
0 commit comments