
Commit bf51609

[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded (#133923)
With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when we only need the lower elements.
1 parent 2e7ed78 commit bf51609
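
As a rough illustration of the effect (a hypothetical reduced case, not one of the tests touched below; the function name and shuffle are made up), a lane-crossing shuffle that only references the lower half of a 512-bit vector no longer needs a 512-bit vpermw on AVX512BW+AVX512VL targets:

; Hypothetical example: every mask index is < 16, so only the low 256 bits
; of %v are demanded and the permute can be done as a 256-bit vpermw.
define <16 x i16> @low_half_perm(<32 x i16> %v) {
  %s = shufflevector <32 x i16> %v, <32 x i16> poison,
       <16 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15,
                   i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
  ret <16 x i16> %s
}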

10 files changed: +217 −152 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 63 additions & 0 deletions
@@ -43812,6 +43812,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     }
     break;
   }
+  case X86ISD::VPERMV: {
+    SmallVector<int, 16> Mask;
+    SmallVector<SDValue, 2> Ops;
+    if ((VT.is256BitVector() || Subtarget.hasVLX()) &&
+        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+      // For lane-crossing shuffles, only split in half in case we're still
+      // referencing higher elements.
+      unsigned HalfElts = NumElts / 2;
+      unsigned HalfSize = SizeInBits / 2;
+      Mask.resize(HalfElts);
+      if (all_of(Mask,
+                 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
+        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+        SDLoc DL(Op);
+        SDValue Ext;
+        SDValue M =
+            extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
+        SDValue V =
+            extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
+        // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
+        if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
+          Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
+        else
+          Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
+        SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+                                        Subtarget, TLO.DAG, DL, SizeInBits);
+        return TLO.CombineTo(Op, Insert);
+      }
+    }
+    break;
+  }
+  case X86ISD::VPERMV3: {
+    SmallVector<int, 16> Mask;
+    SmallVector<SDValue, 2> Ops;
+    if (Subtarget.hasVLX() &&
+        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+      // For lane-crossing shuffles, only split in half in case we're still
+      // referencing higher elements.
+      unsigned HalfElts = NumElts / 2;
+      unsigned HalfSize = SizeInBits / 2;
+      Mask.resize(HalfElts);
+      if (all_of(Mask, [&](int M) {
+            return isUndefOrInRange(M, 0, HalfElts) ||
+                   isUndefOrInRange(M, NumElts, NumElts + HalfElts);
+          })) {
+        // Adjust mask elements for 2nd operand to point to half width.
+        for (int &M : Mask)
+          M = M <= NumElts ? M : (M - HalfElts);
+        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+        MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
+        SDLoc DL(Op);
+        SDValue Ext = TLO.DAG.getNode(
+            Opc, DL, HalfVT,
+            extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
+            getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
+            extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
+        SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+                                        Subtarget, TLO.DAG, DL, SizeInBits);
+        return TLO.CombineTo(Op, Insert);
+      }
+    }
+    break;
+  }
   case X86ISD::VPERM2X128: {
     // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
     SDLoc DL(Op);
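
A note on the VPERMV3 renumbering above, with a hypothetical example (values not taken from the patch): for a v8i32 VPERMI2D, mask indices in [0,8) select from the first source and [8,16) from the second. If only the low four result elements are demanded and every index falls in [0,4) or [8,12), both sources can be halved; each second-source index is rebased down by HalfElts = 4, so a mask of [0,9,2,11] becomes [0,5,2,7], where [4,8) now addresses the half-width second source. In IR terms, a shuffle like the sketch below would be a candidate:

; Hypothetical candidate: indices 9 and 11 (elements 1 and 3 of %b) stay in
; the low half of %b, so the v8i32 two-source permute can shrink to v4i32.
define <4 x i32> @low_half_perm2(<8 x i32> %a, <8 x i32> %b) {
  %s = shufflevector <8 x i32> %a, <8 x i32> %b,
       <4 x i32> <i32 0, i32 9, i32 2, i32 11>
  ret <4 x i32> %s
}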

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 23 additions & 23 deletions
@@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ;
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT: vzeroupper
@@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1
 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ;
 ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
@@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT: vzeroupper
@@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper
@@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT: vzeroupper
@@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 4 additions & 4 deletions
@@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW: # %bb.0:
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-SLOW-NEXT: vzeroupper
@@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
@@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-SLOW: # %bb.0:
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)

llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll

Lines changed: 6 additions & 6 deletions
@@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79]
+; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2
 ; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
 ; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax
@@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
-; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
+; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15]
+; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2
+; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2
 ; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax
 ; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
 ; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0
+; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vzeroupper

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll

Lines changed: 40 additions & 36 deletions
@@ -593,100 +593,104 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-LABEL: load_i16_stride5_vf4:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-NEXT: vmovq %xmm2, (%r9)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <20 x i16>, ptr %in.vec, align 64
