
Commit 539e60c

[X86] X86FixupVectorConstantsPass - consistently use non-DQI 128/256-bit subvector broadcasts
Without the predicate there's no benefit to using the DQI variants instead of the default AVX512F instructions
1 parent 899fd0c · commit 539e60c

17 files changed: +9698 / -19290 lines
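
For orientation, here is a minimal, self-contained C++ sketch of the opcode choice this commit settles on for 512-bit floating-point constant loads. The helper name, the string return values, and the SplatBitWidth parameter are illustrative only (the real pass hands X86 opcode enums to ConvertToBroadcast); what it shows is that the 128/256-bit subvector broadcasts are now always the AVX512F forms, with no HasDQI check:

#include <string>

// Hypothetical sketch only: pickZmmFpBroadcastMnemonic is not an LLVM API.
// It mirrors the post-commit broadcast choice for 512-bit FP constant loads.
std::string pickZmmFpBroadcastMnemonic(unsigned SplatBitWidth) {
  switch (SplatBitWidth) {
  case 256:
    return "vbroadcastf64x4"; // AVX512F (old code used vbroadcastf32x8 when HasDQI)
  case 128:
    return "vbroadcastf32x4"; // AVX512F (old code used vbroadcastf64x2 when HasDQI)
  case 64:
    return "vbroadcastsd";    // scalar broadcasts are unchanged
  case 32:
    return "vbroadcastss";
  default:
    return "";                // no repeating pattern: keep the plain vmovaps/vmovups load
  }
}

Since these are unmasked constant loads, the DQI and AVX512F subvector broadcasts produce the same value, so preferring the AVX512F encodings yields identical code on DQI and non-DQI targets.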

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 14 additions & 18 deletions
@@ -297,17 +297,16 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
   case X86::VMOVAPSZ256rm:
   case X86::VMOVUPDZ256rm:
   case X86::VMOVUPSZ256rm:
-    return ConvertToBroadcast(
-        0, HasDQI ? X86::VBROADCASTF64X2Z128rm : X86::VBROADCASTF32X4Z256rm,
-        X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm, 0, 0, 1);
+    return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm,
+                              X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm,
+                              0, 0, 1);
   case X86::VMOVAPDZrm:
   case X86::VMOVAPSZrm:
   case X86::VMOVUPDZrm:
   case X86::VMOVUPSZrm:
-    return ConvertToBroadcast(
-        HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm,
-        HasDQI ? X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm,
-        X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1);
+    return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
+                              X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
+                              1);
   /* Integer Loads */
   case X86::VMOVDQArm:
   case X86::VMOVDQUrm:
@@ -336,21 +335,18 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
   case X86::VMOVDQA64Z256rm:
   case X86::VMOVDQU32Z256rm:
   case X86::VMOVDQU64Z256rm:
-    return ConvertToBroadcast(
-        0, HasDQI ? X86::VBROADCASTI64X2Z128rm : X86::VBROADCASTI32X4Z256rm,
-        X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
-        HasBWI ? X86::VPBROADCASTWZ256rm : 0,
-        HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
+    return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm,
+                              X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
+                              HasBWI ? X86::VPBROADCASTWZ256rm : 0,
+                              HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
   case X86::VMOVDQA32Zrm:
   case X86::VMOVDQA64Zrm:
   case X86::VMOVDQU32Zrm:
   case X86::VMOVDQU64Zrm:
-    return ConvertToBroadcast(
-        HasDQI ? X86::VBROADCASTI32X8rm : X86::VBROADCASTI64X4rm,
-        HasDQI ? X86::VBROADCASTI64X2rm : X86::VBROADCASTI32X4rm,
-        X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
-        HasBWI ? X86::VPBROADCASTWZrm : 0, HasBWI ? X86::VPBROADCASTBZrm : 0,
-        1);
+    return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
+                              X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
+                              HasBWI ? X86::VPBROADCASTWZrm : 0,
+                              HasBWI ? X86::VPBROADCASTBZrm : 0, 1);
   }

   auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
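
A visible consequence in the test updates below: AVX512 configurations that differ only in DQI now generate identical code for these constant loads, so check blocks that previously needed separate AVX512BW-ONLY and AVX512DQBW-ONLY prefixes collapse into the shared AVX512BW prefix (the old prefixes drop to the unused-prefix list), which accounts for most of the removed lines.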

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll

Lines changed: 156 additions & 307 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll

Lines changed: 282 additions & 561 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll

Lines changed: 564 additions & 1122 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll

Lines changed: 893 additions & 1780 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll

Lines changed: 38 additions & 73 deletions
@@ -1467,79 +1467,42 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
-; AVX512BW-ONLY-LABEL: load_i8_stride3_vf64:
-; AVX512BW-ONLY: # %bb.0:
-; AVX512BW-ONLY-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512BW-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX512BW-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512BW-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
-; AVX512BW-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512BW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512BW-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
-; AVX512BW-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512BW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512BW-ONLY-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
-; AVX512BW-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512BW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-ONLY-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
-; AVX512BW-ONLY-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm0, %zmm0
-; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm2, %zmm2
-; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-ONLY-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
-; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
-; AVX512BW-ONLY-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rsi)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, (%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rcx)
-; AVX512BW-ONLY-NEXT: vzeroupper
-; AVX512BW-ONLY-NEXT: retq
-;
-; AVX512DQBW-ONLY-LABEL: load_i8_stride3_vf64:
-; AVX512DQBW-ONLY: # %bb.0:
-; AVX512DQBW-ONLY-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQBW-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQBW-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQBW-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512DQBW-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX512DQBW-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
-; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512DQBW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
-; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512DQBW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
-; AVX512DQBW-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512DQBW-ONLY-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQBW-ONLY-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
-; AVX512DQBW-ONLY-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
-; AVX512DQBW-ONLY-NEXT: vpshufb %zmm3, %zmm0, %zmm0
-; AVX512DQBW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512DQBW-ONLY-NEXT: vpshufb %zmm3, %zmm2, %zmm2
-; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512DQBW-ONLY-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
-; AVX512DQBW-ONLY-NEXT: kmovq %rax, %k1
-; AVX512DQBW-ONLY-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512DQBW-ONLY-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
-; AVX512DQBW-ONLY-NEXT: vmovdqa64 %zmm0, (%rsi)
-; AVX512DQBW-ONLY-NEXT: vmovdqa64 %zmm2, (%rdx)
-; AVX512DQBW-ONLY-NEXT: vmovdqa64 %zmm1, (%rcx)
-; AVX512DQBW-ONLY-NEXT: vzeroupper
-; AVX512DQBW-ONLY-NEXT: retq
+; AVX512BW-LABEL: load_i8_stride3_vf64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX512BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 %wide.vec = load <192 x i8>, ptr %in.vec, align 64
 %strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
 %strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
@@ -1558,13 +1521,15 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FAST: {{.*}}
 ; AVX512-SLOW: {{.*}}
 ; AVX512BW-FAST: {{.*}}
+; AVX512BW-ONLY: {{.*}}
 ; AVX512BW-ONLY-FAST: {{.*}}
 ; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512BW-SLOW: {{.*}}
 ; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-ONLY: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
 ; AVX512DQBW-FAST: {{.*}}
+; AVX512DQBW-ONLY: {{.*}}
 ; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-FAST: {{.*}}
 ; AVX512F-ONLY: {{.*}}

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll

Lines changed: 4 additions & 4 deletions
@@ -12065,8 +12065,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
 ; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20
 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15
-; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6]
-; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6]
+; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20
 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
@@ -12089,8 +12089,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
 ; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14
 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6]
-; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6]
+; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19
 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
