[X86] IsElementEquivalent - add handling for ISD::BITCASTS from smaller vector elements #139741
Conversation
…er vector elements: Check if all smaller aliased source elements are equivalent
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Check if all smaller aliased source elements are equivalent

Full diff: https://github.com/llvm/llvm-project/pull/139741.diff

8 Files Affected:
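As a reading aid before the per-file diff, here is a short standalone C++ sketch of the index arithmetic the updated ISD::BITCAST case performs. It is an illustration only, not the patch itself: the function and parameter names are invented for this sketch, and the srcEquivalent callback stands in for the recursive IsElementEquivalent query on the bitcast source vector.

// Standalone sketch (illustrative names; not the LLVM code itself).
#include <functional>

bool bitcastElementsEquivalent(
    unsigned DstEltBits, unsigned SrcEltBits, unsigned Idx,
    unsigned ExpectedIdx,
    const std::function<bool(unsigned, unsigned)> &srcEquivalent) {
  if (SrcEltBits % DstEltBits == 0) {
    // Existing case: wider source elements. Each source element splits into
    // Scale destination elements, so both indices must pick the same sub-lane
    // of equivalent source elements.
    unsigned Scale = SrcEltBits / DstEltBits;
    return (Idx % Scale) == (ExpectedIdx % Scale) &&
           srcEquivalent(Idx / Scale, ExpectedIdx / Scale);
  }
  if (DstEltBits % SrcEltBits == 0) {
    // New case: narrower source elements. Each destination element aliases
    // Scale consecutive source elements, and every aliased pair must match.
    unsigned Scale = DstEltBits / SrcEltBits;
    for (unsigned I = 0; I != Scale; ++I)
      if (!srcEquivalent(Idx * Scale + I, ExpectedIdx * Scale + I))
        return false;
    return true;
  }
  return false; // Element sizes are not multiples of each other.
}

For example, with a v4i32 source bitcast to v2i64, asking whether the two i64 elements are equivalent reduces to checking that i32 elements (0,2) and (1,3) are pairwise equivalent, which is why a value broadcast at the narrower type can now be recognised as a splat at the wider type.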
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e3bb5db07ac40..5514b1e211fd9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9988,19 +9988,29 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
MaskSize == (int)ExpectedOp.getNumOperands())
return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
break;
- case ISD::BITCAST:
- if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
- SDValue Src = peekThroughBitcasts(Op);
- EVT SrcVT = Src.getValueType();
- if (SrcVT.isVector() &&
- (SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
+ case ISD::BITCAST: {
+ SDValue Src = peekThroughBitcasts(Op);
+ EVT SrcVT = Src.getValueType();
+ if (Op == ExpectedOp && SrcVT.isVector() &&
+ (int)VT.getVectorNumElements() == MaskSize) {
+ if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
return (Idx % Scale) == (ExpectedIdx % Scale) &&
IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
Idx / Scale, ExpectedIdx / Scale);
}
+ if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
+ unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+ bool AllEquiv = true;
+ for (unsigned I = 0; I != Scale && AllEquiv; ++I)
+ AllEquiv &=
+ IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
+ (Idx * Scale) + I, (ExpectedIdx * Scale) + I);
+ return AllEquiv;
+ }
}
break;
+ }
case ISD::VECTOR_SHUFFLE: {
auto *SVN = cast<ShuffleVectorSDNode>(Op);
return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll
index 7237b02ca6b66..74267786b3cbf 100644
--- a/llvm/test/CodeGen/X86/combine-concatvectors.ll
+++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll
@@ -92,7 +92,7 @@ define <4 x float> @concat_of_broadcast_v4f32_v8f32(ptr %a0, ptr %a1, ptr %a2) {
; AVX1-NEXT: vmovaps (%rdx), %ymm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,0],xmm0[0,0]
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 632f3c6c1e851..2609b06361af5 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -532,10 +532,10 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm1 = mem[0],xmm0[1],mem[2,3]
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: addq $40, %rsp
; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index e2cc3ae0dca0a..443275e11459d 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -179,8 +179,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
-; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm6 = xmm2[1],xmm6[1]
+; SSSE3-SLOW-NEXT: movaps %xmm6, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -345,8 +345,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
-; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
+; SSSE3-SLOW-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
@@ -374,7 +373,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -397,7 +396,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -422,7 +421,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
@@ -445,7 +444,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 8510b1031d717..1bbf92e45fc6c 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3138,10 +3138,10 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm1 = mem[0],xmm0[1],mem[2,3]
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: addq $40, %rsp
; AVX512-NEXT: retq
%1 = fptrunc <2 x double> %a0 to <2 x half>
@@ -3272,8 +3272,8 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -3404,8 +3404,8 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -4107,8 +4107,8 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; AVX512-NEXT: vmovdqa %xmm0, (%rbx)
; AVX512-NEXT: addq $64, %rsp
; AVX512-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 62e2aadd818c1..57471e2f75d71 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -487,10 +487,10 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1]
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm11[2,0],ymm9[6,5],ymm11[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4],ymm10[5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm10
; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
; AVX-NEXT: vbroadcastss (%r10), %ymm10
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
; AVX-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,0],xmm3[0,0]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0]
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index a166bebae721c..98b5bab98c4f9 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1569,11 +1569,12 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
}
define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
-; X86-SSE2-LABEL: mul_v2i64_0_1:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: xorpd %xmm1, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT: retl
+; SSE2-LABEL: mul_v2i64_0_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: mul_v2i64_0_1:
; SSE4: # %bb.0:
@@ -1581,13 +1582,6 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE4-NEXT: ret{{[l|q]}}
;
-; X64-SSE2-LABEL: mul_v2i64_0_1:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: xorps %xmm1, %xmm1
-; X64-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; X64-SSE2-NEXT: movaps %xmm1, %xmm0
-; X64-SSE2-NEXT: retq
-;
; X64-AVX-LABEL: mul_v2i64_0_1:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index ea0e3b3a2b9aa..c6a4d8b759c9b 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -7068,17 +7068,29 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
+; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; X86-SSE2-LABEL: mul_v2i64_0_1:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: xorpd %xmm1, %xmm1
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
It seems movsd is better.
Are you just referring to the extra move?
No, I mean movsd is better than unpckhpd.
I'm struggling to come up with a way DAG can handle this cleanly - these are v2i64 shuffles that the domain pass flipped later on, much later than the TwoAddr commutation fixes. What's happened is that the i686 targets are now able to see the v2i64 zero (legalised to a v4i32 zero) and match it as a v2i64 shuffle instead of falling back to a v2f64 bitcasted shuffle. Unless we can demonstrate that MOVSD/S isn't affected by domain crossing, we don't have a good pass to fix this :(
To be clear, all that has happened is that the i686 codegen now matches the x86_64 codegen, which has always had this int/fp domain behavior.
Ok, I think 32-bit performance is not so important.
…entify hidden splat/broadcasts: Noticed while yak shaving llvm#139741
Same matching order as other 128/256-bit shuffles. Fixes regression identified in llvm#139741
# Conflicts:
#   llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
Apart from the movsd issue on 32-bit targets (which I'm not sure what to do about, tbh), I've now addressed the remaining regressions.
LGTM.
…entify hidden splat/broadcasts (llvm#141035): Noticed while yak shaving llvm#139741
…lvm#141073): Use the same matching order as other 128/256-bit shuffles. Fixes regression identified in llvm#139741
…er vector elements (llvm#139741): Check if all smaller aliased source elements are equivalent