[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded (REAPPLIED) #134263
Conversation
…ERMV/VPERMV3 nodes if the upper elements are not demanded (REAPPLIED)

With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when we only need the lower elements.

Reapplied version of llvm#133923 with fix for typo in the VPERMV3 mask adjustment
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when we only need the lower elements.

Reapplied version of #133923 with fix for typo in the VPERMV3 mask adjustment

Patch is 42.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134263.diff

10 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 546a2d22fa58e..d2d022ab52c41 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43827,6 +43827,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
break;
}
+ case X86ISD::VPERMV: {
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if ((VT.is256BitVector() || Subtarget.hasVLX()) &&
+ getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+ // For lane-crossing shuffles, only split in half in case we're still
+ // referencing higher elements.
+ unsigned HalfElts = NumElts / 2;
+ unsigned HalfSize = SizeInBits / 2;
+ Mask.resize(HalfElts);
+ if (all_of(Mask,
+ [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
+ MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+ SDLoc DL(Op);
+ SDValue Ext;
+ SDValue M =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
+ SDValue V =
+ extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
+ // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
+ if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
+ Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
+ else
+ Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
+ SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+ Subtarget, TLO.DAG, DL, SizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
+ case X86ISD::VPERMV3: {
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (Subtarget.hasVLX() &&
+ getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+ // For lane-crossing shuffles, only split in half in case we're still
+ // referencing higher elements.
+ unsigned HalfElts = NumElts / 2;
+ unsigned HalfSize = SizeInBits / 2;
+ Mask.resize(HalfElts);
+ if (all_of(Mask, [&](int M) {
+ return isUndefOrInRange(M, 0, HalfElts) ||
+ isUndefOrInRange(M, NumElts, NumElts + HalfElts);
+ })) {
+ // Adjust mask elements for 2nd operand to point to half width.
+ for (int &M : Mask)
+ M = (M < NumElts) ? M : (M - HalfElts);
+ MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+ MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
+ SDLoc DL(Op);
+ SDValue Ext = TLO.DAG.getNode(
+ Opc, DL, HalfVT,
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
+ getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
+ extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
+ SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+ Subtarget, TLO.DAG, DL, SizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
case X86ISD::VPERM2X128: {
// Simplify VPERM2F128/VPERM2I128 to extract_subvector.
SDLoc DL(Op);
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 6f4e7abda8b00..b075d48627b18 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
;
; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
@@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 52f856befa130..61e122b1aba36 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
@@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 26af46263c0e2..a84466bc1ca1a 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79]
+; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2
; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax
@@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
-; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
+; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15]
+; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2
+; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2
; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax
; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0
+; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0
; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 739e6e2369e36..9b19ec15c6f55 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -593,100 +593,104 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi...
[truncated]
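As a reading aid for the new X86ISD::VPERMV case above, the following standalone C++ sketch models the decision it makes; the helper name and plain std types are illustrative only, not LLVM APIs. The demanded lower half of the result may only read lower-half source elements, and when the narrowed type would be a 128-bit vector of 32/64-bit elements the patch switches to the in-lane VPERMILPS/VPERMILPD variable permute, since no 128-bit variable cross-lane dword/qword permute is available.

#include <cstdio>
#include <vector>

enum class Narrowing { KeepFullWidth, HalfWidthVPERMV, VPERMILPV128 };

// Hypothetical helper (not an LLVM API) mirroring the checks in the
// X86ISD::VPERMV case of the patch.
static Narrowing classifyVPERMV(const std::vector<int> &Mask,
                                unsigned VecBits, unsigned EltBits) {
  unsigned NumElts = VecBits / EltBits;
  unsigned HalfElts = NumElts / 2;
  // Only the lower half of the result is demanded, so it must read nothing
  // but lower-half source elements (-1 stands in for an undef mask element).
  for (unsigned I = 0; I != HalfElts; ++I) {
    int M = Mask[I];
    if (!(M == -1 || (M >= 0 && M < (int)HalfElts)))
      return Narrowing::KeepFullWidth;
  }
  // 512-bit sources narrow to a 256-bit VPERMV, and 8/16-bit elements keep
  // VPERMB/VPERMW forms at 128 bits. A 256-bit source of 32/64-bit elements
  // narrows to 128 bits, where only the in-lane VPERMILPS/VPERMILPD variable
  // permute is available.
  if (VecBits == 512 || EltBits <= 16)
    return Narrowing::HalfWidthVPERMV;
  return Narrowing::VPERMILPV128;
}

int main() {
  // A v8i32 vpermd whose demanded low four lanes only read elements 0..3.
  std::vector<int> Mask = {0, 2, 1, 3, 4, 5, 6, 7};
  Narrowing N = classifyVPERMV(Mask, /*VecBits=*/256, /*EltBits=*/32);
  std::printf("%s\n", N == Narrowing::VPERMILPV128 ? "vpermilps" : "other");
  return 0;
}

After the narrowed shuffle is built, the patch widens the result back to the original type with undef upper elements (widenSubVector with ZeroNewElements=false), which is safe because only the lower elements were demanded.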
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/92/builds/16572

Here is the relevant piece of the build log for the reference:
With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when we only need the lower elements.
Reapplied version of #133923 with fix for typo in the VPERMV3 mask adjustment
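The VPERMV3 case, where the reapply's fix lands, performs the same half-width check across both sources and then renumbers second-source indices for the narrowed node. A minimal sketch, again with a hypothetical helper name and plain std types rather than the real LLVM code:

#include <cstdio>
#include <optional>
#include <vector>

// Hypothetical helper (not an LLVM API) modelling the VPERMV3 narrowing.
// Indices [0, NumElts) pick from the first source, [NumElts, 2*NumElts)
// from the second; -1 stands in for an undef mask element.
static std::optional<std::vector<int>>
narrowVPERMV3Mask(const std::vector<int> &Mask, int NumElts) {
  int HalfElts = NumElts / 2;
  std::vector<int> Lower(Mask.begin(), Mask.begin() + HalfElts);
  for (int M : Lower) {
    bool Undef = (M == -1);
    bool LoOfOp0 = (M >= 0 && M < HalfElts);
    bool LoOfOp2 = (M >= NumElts && M < NumElts + HalfElts);
    if (!Undef && !LoOfOp0 && !LoOfOp2)
      return std::nullopt; // An upper-half element is needed; keep full width.
  }
  // Renumber second-source indices: in the half-width shuffle the second
  // source begins at HalfElts rather than NumElts.
  for (int &M : Lower)
    if (M >= NumElts)
      M -= HalfElts;
  return Lower;
}

int main() {
  // A v16i32 vpermi2d mask interleaving the low halves of both sources.
  std::vector<int> Mask = {0, 16, 1, 17, 2, 18, 3, 19,
                           0, 0, 0, 0, 0, 0, 0, 0}; // upper half not demanded
  if (auto Narrow = narrowVPERMV3Mask(Mask, 16)) {
    for (int M : *Narrow)
      std::printf("%d ", M); // prints: 0 8 1 9 2 10 3 11
    std::printf("\n");
  }
  return 0;
}

The subtraction by HalfElts is the mask adjustment the description refers to; per the PR text, a typo in that adjustment is what caused the original #133923 to be reverted and reapplied here.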