@@ -1450,19 +1450,17 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
 ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3]
 ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2]
-; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,2]
 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
 ; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3]
 ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7]
 ; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3]
 ; SSE42-NEXT: movups %xmm5, 16(%rsi)
 ; SSE42-NEXT: movups %xmm4, (%rsi)
-; SSE42-NEXT: movdqu %xmm10, 16(%rdx)
+; SSE42-NEXT: movdqu %xmm8, 16(%rdx)
 ; SSE42-NEXT: movdqu %xmm6, (%rdx)
 ; SSE42-NEXT: movups %xmm9, 16(%rcx)
 ; SSE42-NEXT: movups %xmm7, (%rcx)
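The SSE42 hunk replaces a pshufd/pshufd/pblendw triple with a pblendw/pshufd pair that produces the same dwords. A minimal sketch of the equivalence with SSE4.1 intrinsics (a/b stand in for xmm6/xmm3; the helper names are illustrative, not from the test):

    #include <smmintrin.h>  /* SSE4.1: _mm_blend_epi16 */

    /* Old lowering: 3 ops to build [a1, a0, a3, b2]. */
    __m128i old_way(__m128i a, __m128i b) {
        __m128i t = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 2, 2, 2)); /* pshufd b[2,2,2,2] */
        __m128i s = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 0, 1)); /* pshufd a[1,0,3,3] */
        return _mm_blend_epi16(s, t, 0xC0);                        /* words 6,7 from t  */
    }

    /* New lowering: blend dword 2 of b into a, then one pshufd. */
    __m128i new_way(__m128i a, __m128i b) {
        __m128i t = _mm_blend_epi16(a, b, 0x30);              /* words 4,5 (dword 2) from b */
        return _mm_shuffle_epi32(t, _MM_SHUFFLE(2, 3, 0, 1)); /* pshufd t[1,0,3,2]          */
    }

Both return [a1, a0, a3, b2], so the new sequence saves one shuffle and frees xmm10, which is why the trailing store switches from %xmm10 to %xmm8.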
@@ -1504,19 +1502,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
 ; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
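The new AVX2 lowering blends all three source vectors into one register first and then applies a single full 8-lane vpermps, instead of permuting ymm2 separately and blending the partial results afterwards. A runnable sketch of the pattern with AVX2 intrinsics (function and variable names are illustrative; the blend masks and permute indices are read off the CHECK lines above, with v0/v1/v2 mirroring ymm0/ymm1/ymm2):

    #include <immintrin.h>

    /* Deinterleave 24 floats at p (stride 3) into the first two outputs. */
    void deinterleave_head(const float *p, float *q1, float *q2) {
        __m256 v0 = _mm256_loadu_ps(p);       /* elements 0..7   */
        __m256 v1 = _mm256_loadu_ps(p + 8);   /* elements 8..15  */
        __m256 v2 = _mm256_loadu_ps(p + 16);  /* elements 16..23 */

        /* q1 = elements 0,3,6,...,21: take lanes 1,4,7 from v1 and
           lanes 2,5 from v2, then one cross-lane permute. */
        __m256 t = _mm256_blend_ps(v0, v1, 0x92);
        t = _mm256_blend_ps(t, v2, 0x24);
        _mm256_storeu_ps(q1, _mm256_permutevar8x32_ps(
            t, _mm256_setr_epi32(0, 3, 6, 1, 4, 7, 2, 5)));

        /* q2 = elements 1,4,7,...,22: lanes 2,5 from v1, lanes 0,3,6 from v2. */
        __m256 u = _mm256_blend_ps(v0, v1, 0x24);
        u = _mm256_blend_ps(u, v2, 0x49);
        _mm256_storeu_ps(q2, _mm256_permutevar8x32_ps(
            u, _mm256_setr_epi32(1, 4, 7, 2, 5, 0, 3, 6)));
    }

Because every result lane is now defined, the permute indices lose their trailing u (undef) entries, and the vbroadcastsd/vbroadcastf128 constant loads plus the final fix-up vblendps disappear.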
@@ -1534,26 +1527,18 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1
 ; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
-; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rsi)
 ; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdx)
 ; AVX2-FAST-ALL-NEXT: vmovups %ymm0, (%rcx)
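In the FAST-ALL variant the third output also collapses to the blend+vpermps shape, replacing the separate vpermps of ymm2 and the trailing vblendps. A sketch of that tail under the same illustrative naming as above:

    #include <immintrin.h>

    /* Third output, q3 = elements 2,5,8,...,23 of the 24-float block. */
    void deinterleave_tail(const float *p, float *q3) {
        __m256 v0 = _mm256_loadu_ps(p);       /* elements 0..7   */
        __m256 v1 = _mm256_loadu_ps(p + 8);   /* elements 8..15  */
        __m256 v2 = _mm256_loadu_ps(p + 16);  /* elements 16..23 */

        __m256 t = _mm256_blend_ps(v1, v0, 0x24);  /* lanes 2,5 from v0   */
        t = _mm256_blend_ps(t, v2, 0x92);          /* lanes 1,4,7 from v2 */
        _mm256_storeu_ps(q3, _mm256_permutevar8x32_ps(
            t, _mm256_setr_epi32(2, 5, 0, 3, 6, 1, 4, 7)));
    }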
@@ -1565,19 +1550,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0