@@ -1467,79 +1467,42 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
- ; AVX512BW-ONLY-LABEL: load_i8_stride3_vf64:
- ; AVX512BW-ONLY: # %bb.0:
- ; AVX512BW-ONLY-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512BW-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512BW-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
- ; AVX512BW-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
- ; AVX512BW-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4
- ; AVX512BW-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5
- ; AVX512BW-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
- ; AVX512BW-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
- ; AVX512BW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
- ; AVX512BW-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
- ; AVX512BW-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
- ; AVX512BW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
- ; AVX512BW-ONLY-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
- ; AVX512BW-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
- ; AVX512BW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
- ; AVX512BW-ONLY-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
- ; AVX512BW-ONLY-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
- ; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm0, %zmm0
- ; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1
- ; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm2, %zmm2
- ; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512BW-ONLY-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
- ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
- ; AVX512BW-ONLY-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
- ; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
- ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rsi)
- ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, (%rdx)
- ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rcx)
- ; AVX512BW-ONLY-NEXT: vzeroupper
- ; AVX512BW-ONLY-NEXT: retq
- ;
- ; AVX512DQBW-ONLY-LABEL: load_i8_stride3_vf64:
- ; AVX512DQBW-ONLY: # %bb.0:
- ; AVX512DQBW-ONLY-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512DQBW-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512DQBW-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
- ; AVX512DQBW-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
- ; AVX512DQBW-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4
- ; AVX512DQBW-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5
- ; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
- ; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
- ; AVX512DQBW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
- ; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
- ; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
- ; AVX512DQBW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
- ; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
- ; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
- ; AVX512DQBW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
- ; AVX512DQBW-ONLY-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
- ; AVX512DQBW-ONLY-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
- ; AVX512DQBW-ONLY-NEXT: vpshufb %zmm3, %zmm0, %zmm0
- ; AVX512DQBW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1
- ; AVX512DQBW-ONLY-NEXT: vpshufb %zmm3, %zmm2, %zmm2
- ; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512DQBW-ONLY-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
- ; AVX512DQBW-ONLY-NEXT: kmovq %rax, %k1
- ; AVX512DQBW-ONLY-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
- ; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
- ; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
- ; AVX512DQBW-ONLY-NEXT: vmovdqa64 %zmm0, (%rsi)
- ; AVX512DQBW-ONLY-NEXT: vmovdqa64 %zmm2, (%rdx)
- ; AVX512DQBW-ONLY-NEXT: vmovdqa64 %zmm1, (%rcx)
- ; AVX512DQBW-ONLY-NEXT: vzeroupper
- ; AVX512DQBW-ONLY-NEXT: retq
+ ; AVX512BW-LABEL: load_i8_stride3_vf64:
+ ; AVX512BW: # %bb.0:
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+ ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+ ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+ ; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3
+ ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4
+ ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5
+ ; AVX512BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+ ; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+ ; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+ ; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+ ; AVX512BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+ ; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+ ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+ ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+ ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+ ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+ ; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
+ ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+ ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+ ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+ ; AVX512BW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+ ; AVX512BW-NEXT: kmovq %rax, %k1
+ ; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+ ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+ ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+ ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+ ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi)
+ ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx)
+ ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx)
+ ; AVX512BW-NEXT: vzeroupper
+ ; AVX512BW-NEXT: retq
%wide.vec = load <192 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
%strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
@@ -1558,13 +1521,15 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST: {{.*}}
; AVX512-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
+ ; AVX512BW-ONLY: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-ONLY: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
+ ; AVX512DQBW-ONLY: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY: {{.*}}