
Commit 12f75bb

Revert "[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded" (#134256)
Found a typo in the VPERMV3 mask adjustment; I'm going to revert and re-apply the patch with a fix. Reverts #133923.
1 parent 79435de commit 12f75bb

10 files changed: +152 -217 lines changed
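
For context on the typo being reverted: in the VPERMV3 hunk below, shuffle mask values in [0, NumElts) select elements from the first source operand, while values in [NumElts, 2*NumElts) select from the second. When the node is narrowed to half width, second-operand indices must be shifted down by HalfElts, and the boundary test has to be strict: the reverted code's "M <= NumElts" leaves M == NumElts (the first element of the second operand) unadjusted. A minimal sketch of the corrected adjustment, assuming that comparison is the typo in question (the re-applied fix itself is not shown on this page):

    // Hypothetical corrected adjustment (illustration, not the landed patch).
    // Mask values in [0, NumElts) index operand 0; values in
    // [NumElts, 2*NumElts) index operand 2 and must shift down by HalfElts
    // once the shuffle is narrowed to half width.
    for (int &M : Mask)
      M = M < (int)NumElts ? M : (M - HalfElts); // strict '<': M == NumElts
                                                 // belongs to operand 2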

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 0 additions & 63 deletions
@@ -43827,69 +43827,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     }
     break;
   }
-  case X86ISD::VPERMV: {
-    SmallVector<int, 16> Mask;
-    SmallVector<SDValue, 2> Ops;
-    if ((VT.is256BitVector() || Subtarget.hasVLX()) &&
-        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
-      // For lane-crossing shuffles, only split in half in case we're still
-      // referencing higher elements.
-      unsigned HalfElts = NumElts / 2;
-      unsigned HalfSize = SizeInBits / 2;
-      Mask.resize(HalfElts);
-      if (all_of(Mask,
-                 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
-        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
-        SDLoc DL(Op);
-        SDValue Ext;
-        SDValue M =
-            extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
-        SDValue V =
-            extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
-        // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
-        if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
-          Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
-        else
-          Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
-        SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
-                                        Subtarget, TLO.DAG, DL, SizeInBits);
-        return TLO.CombineTo(Op, Insert);
-      }
-    }
-    break;
-  }
-  case X86ISD::VPERMV3: {
-    SmallVector<int, 16> Mask;
-    SmallVector<SDValue, 2> Ops;
-    if (Subtarget.hasVLX() &&
-        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
-      // For lane-crossing shuffles, only split in half in case we're still
-      // referencing higher elements.
-      unsigned HalfElts = NumElts / 2;
-      unsigned HalfSize = SizeInBits / 2;
-      Mask.resize(HalfElts);
-      if (all_of(Mask, [&](int M) {
-            return isUndefOrInRange(M, 0, HalfElts) ||
-                   isUndefOrInRange(M, NumElts, NumElts + HalfElts);
-          })) {
-        // Adjust mask elements for 2nd operand to point to half width.
-        for (int &M : Mask)
-          M = M <= NumElts ? M : (M - HalfElts);
-        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
-        MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
-        SDLoc DL(Op);
-        SDValue Ext = TLO.DAG.getNode(
-            Opc, DL, HalfVT,
-            extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
-            getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
-            extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
-        SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
-                                        Subtarget, TLO.DAG, DL, SizeInBits);
-        return TLO.CombineTo(Op, Insert);
-      }
-    }
-    break;
-  }
   case X86ISD::VPERM2X128: {
     // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
     SDLoc DL(Op);
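
The reverted VPERMV case above applies the same narrowing without the two-operand wrinkle: when only the lower half of the result is demanded and the truncated mask references only lower-half source elements, permuting the extracted lower halves yields the same demanded elements, which are then widened back with undef upper elements. A standalone scalar model of that equivalence (illustration only; permv below is a hypothetical stand-in for the shuffle node, not an LLVM API):

    // Model of a variable permute: Res[i] = Src[Mask[i]].
    #include <cassert>
    #include <cstddef>
    #include <vector>

    static std::vector<int> permv(const std::vector<int> &Mask,
                                  const std::vector<int> &Src) {
      std::vector<int> Res(Mask.size());
      for (std::size_t I = 0; I != Mask.size(); ++I)
        Res[I] = Src[Mask[I]];
      return Res;
    }

    int main() {
      std::vector<int> Src = {10, 11, 12, 13, 14, 15, 16, 17}; // NumElts = 8
      std::vector<int> Mask = {3, 0, 2, 1, 7, 6, 5, 4};
      std::size_t HalfElts = Src.size() / 2;

      // Only the low half of the result is demanded, and the truncated mask
      // {3,0,2,1} references only elements below HalfElts.
      std::vector<int> HalfMask(Mask.begin(), Mask.begin() + HalfElts);
      std::vector<int> HalfSrc(Src.begin(), Src.begin() + HalfElts);

      std::vector<int> Full = permv(Mask, Src);
      std::vector<int> Half = permv(HalfMask, HalfSrc);
      for (std::size_t I = 0; I != HalfElts; ++I)
        assert(Half[I] == Full[I]); // demanded (low-half) elements agree
      return 0;
    }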

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 23 additions & 23 deletions
@@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ;
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT: vzeroupper
@@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
+; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
+; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ;
 ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
@@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
-; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT: vzeroupper
@@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
-; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper
@@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
-; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT: vzeroupper
@@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
-; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 4 additions & 4 deletions
@@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW: # %bb.0:
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-SLOW-NEXT: vzeroupper
@@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
@@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-SLOW: # %bb.0:
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)

llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll

Lines changed: 6 additions & 6 deletions
@@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2
+; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
+; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
 ; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax
@@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15]
-; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2
-; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2
+; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
+; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
 ; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax
 ; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
 ; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0
+; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT: vzeroupper

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll

Lines changed: 36 additions & 40 deletions
@@ -593,104 +593,100 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-LABEL: load_i16_stride5_vf4:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-NEXT: vmovq %xmm1, (%r9)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <20 x i16>, ptr %in.vec, align 64
