Revert "[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded" #134256
Merged
Conversation
Revert "[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded". This reverts commit bf51609.
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: Found a typo that needs addressing - I'm going to revert and re-apply the patch with a fix.

Reverts llvm/llvm-project#133923

Patch is 42.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134256.diff

10 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d1be19539b642..546a2d22fa58e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43827,69 +43827,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
break;
}
- case X86ISD::VPERMV: {
- SmallVector<int, 16> Mask;
- SmallVector<SDValue, 2> Ops;
- if ((VT.is256BitVector() || Subtarget.hasVLX()) &&
- getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
- // For lane-crossing shuffles, only split in half in case we're still
- // referencing higher elements.
- unsigned HalfElts = NumElts / 2;
- unsigned HalfSize = SizeInBits / 2;
- Mask.resize(HalfElts);
- if (all_of(Mask,
- [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
- MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
- SDLoc DL(Op);
- SDValue Ext;
- SDValue M =
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
- SDValue V =
- extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
- // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
- if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
- Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
- else
- Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
- SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
- Subtarget, TLO.DAG, DL, SizeInBits);
- return TLO.CombineTo(Op, Insert);
- }
- }
- break;
- }
- case X86ISD::VPERMV3: {
- SmallVector<int, 16> Mask;
- SmallVector<SDValue, 2> Ops;
- if (Subtarget.hasVLX() &&
- getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
- // For lane-crossing shuffles, only split in half in case we're still
- // referencing higher elements.
- unsigned HalfElts = NumElts / 2;
- unsigned HalfSize = SizeInBits / 2;
- Mask.resize(HalfElts);
- if (all_of(Mask, [&](int M) {
- return isUndefOrInRange(M, 0, HalfElts) ||
- isUndefOrInRange(M, NumElts, NumElts + HalfElts);
- })) {
- // Adjust mask elements for 2nd operand to point to half width.
- for (int &M : Mask)
- M = M <= NumElts ? M : (M - HalfElts);
- MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
- MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
- SDLoc DL(Op);
- SDValue Ext = TLO.DAG.getNode(
- Opc, DL, HalfVT,
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
- getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
- extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
- SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
- Subtarget, TLO.DAG, DL, SizeInBits);
- return TLO.CombineTo(Op, Insert);
- }
- }
- break;
- }
case X86ISD::VPERM2X128: {
// Simplify VPERM2F128/VPERM2I128 to extract_subvector.
SDLoc DL(Op);
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index b075d48627b18..6f4e7abda8b00 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
+; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
+; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
;
; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
-; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
-; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
@@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
-; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
-; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 61e122b1aba36..52f856befa130 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
@@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index a84466bc1ca1a..26af46263c0e2 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2
+; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
+; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax
@@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15]
-; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2
-; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2
+; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
+; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax
; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0
+; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0
; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 9b19ec15c6f55..739e6e2369e36 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -593,104 +593,100 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-NEXT: vmovq %xmm1, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm...
[truncated]
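For readers skimming the truncated diff above, the condition the reverted combine tested can be stated without any LLVM machinery. Below is a minimal standalone sketch under simplifying assumptions: the function and helper names are invented for illustration, plain integer vectors stand in for SelectionDAG nodes, and only the single-source VPERMV case is shown. The real code in the first hunk of the diff additionally builds the half-width shuffle and widens the result back to the original vector size.

```cpp
// Standalone illustration (not LLVM code) of the narrowing condition used by
// the reverted VPERMV combine: if only the lower half of the result is
// demanded and every demanded mask index stays within the lower half of the
// source, the shuffle can be performed at half width.
#include <cstdio>
#include <vector>

// Simplified stand-in for llvm::isUndefOrInRange: undef (-1) or in [Lo, Hi).
static bool isUndefOrInRange(int M, int Lo, int Hi) {
  return M < 0 || (M >= Lo && M < Hi);
}

// Hypothetical helper: returns true when the half-width rewrite is legal.
static bool canNarrowVPERMV(std::vector<int> Mask) {
  const unsigned NumElts = Mask.size();
  const unsigned HalfElts = NumElts / 2;
  Mask.resize(HalfElts); // keep only the demanded (lower) half of the result
  for (int M : Mask)
    if (!isUndefOrInRange(M, 0, (int)HalfElts))
      return false;
  return true;
}

int main() {
  // Demanded half references only lower-half elements: narrowing is legal.
  std::printf("%d\n", canNarrowVPERMV({0, 3, 2, 1, 7, 6, 5, 4})); // prints 1
  // Demanded half references element 5 of an 8-wide source: it is not.
  std::printf("%d\n", canNarrowVPERMV({0, 5, 2, 1, 0, 0, 0, 0})); // prints 0
}
```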
Found a typo in the VPERMV3 mask adjustment - I'm going to revert and re-apply the patch with a fix
Reverts #133923
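The adjustment referred to here is the remapping loop in the deleted VPERMV3 hunk, which rewrites operand-2 mask indices into the half-width index space. One plausible reading of the typo, stated only as an assumption since the fix is not part of this revert, is the `M <= NumElts` boundary: index NumElts is the first element of the second operand and should be remapped, but `<=` leaves it untouched. The standalone sketch below shows the effect with made-up values; it is not the LLVM code.

```cpp
// Illustration only: how an off-by-one in the VPERMV3 mask-adjustment
// boundary would misclassify index NumElts. The values and the assumption
// about the typo's location are illustrative, not taken from the fix.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 16;           // elements per full-width operand
  const int HalfElts = NumElts / 2; // elements per half-width operand

  // VPERMV3 mask semantics: [0, NumElts) selects from operand 0,
  // [NumElts, 2*NumElts) selects from operand 2. Only the lower half of the
  // result is demanded; index 16 is the first element of operand 2.
  std::vector<int> Mask = {0, 1, 2, 3, 16, 17, 18, 19};

  for (int M : Mask) {
    // As written in the reverted hunk: "M = M <= NumElts ? M : (M - HalfElts)".
    int AsWritten = (M <= NumElts) ? M : (M - HalfElts);
    // What the half-width remap presumably needs: a strict comparison, so every
    // operand-2 index (including NumElts itself) lands in [HalfElts, NumElts).
    int Expected = (M < NumElts) ? M : (M - HalfElts);
    std::printf("M=%2d  as-written=%2d  expected=%2d%s\n", M, AsWritten,
                Expected, AsWritten != Expected ? "  <-- out of range" : "");
  }
}
```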