
[X86] IsElementEquivalent - add handling for ISD::BITCASTS from smaller vector elements #139741


Merged
9 commits merged on May 23, 2025
22 changes: 16 additions & 6 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9987,19 +9987,29 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
         MaskSize == (int)ExpectedOp.getNumOperands())
       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
     break;
-  case ISD::BITCAST:
-    if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
-      SDValue Src = peekThroughBitcasts(Op);
-      EVT SrcVT = Src.getValueType();
-      if (SrcVT.isVector() &&
-          (SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
+  case ISD::BITCAST: {
+    SDValue Src = peekThroughBitcasts(Op);
+    EVT SrcVT = Src.getValueType();
+    if (Op == ExpectedOp && SrcVT.isVector() &&
+        (int)VT.getVectorNumElements() == MaskSize) {
+      if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
         unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
         return (Idx % Scale) == (ExpectedIdx % Scale) &&
                IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
                                    Idx / Scale, ExpectedIdx / Scale);
       }
+      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
+        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+        for (unsigned I = 0; I != Scale; ++I)
+          if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
+                                   (Idx * Scale) + I,
+                                   (ExpectedIdx * Scale) + I))
+            return false;
+        return true;
+      }
     }
     break;
+  }
   case ISD::VECTOR_SHUFFLE: {
     auto *SVN = cast<ShuffleVectorSDNode>(Op);
     return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
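As context for the change above, here is a minimal standalone C++ sketch of the element-index mapping that the ISD::BITCAST case now performs. It is not LLVM code: the function and variable names are illustrative, and elementsMatch() is a hypothetical stand-in for the recursive IsElementEquivalent query on the bitcast source.

#include <cstdio>

// Hypothetical stand-in for the recursive IsElementEquivalent call on the
// bitcast source: here two source elements only "match" if they are the same
// element.
static bool elementsMatch(unsigned SrcIdx, unsigned SrcExpectedIdx) {
  return SrcIdx == SrcExpectedIdx;
}

// Elements of the bitcast result are EltBits wide; elements of the bitcast
// source are SrcEltBits wide. Returns true if result elements Idx and
// ExpectedIdx can be proven identical.
static bool bitcastElementsEquivalent(unsigned EltBits, unsigned SrcEltBits,
                                      unsigned Idx, unsigned ExpectedIdx) {
  if (SrcEltBits % EltBits == 0) {
    // Wide source, narrow result (e.g. v2i64 -> v4i32): each source element
    // splits into Scale result elements. Two result elements match if they
    // occupy the same sub-lane of matching source elements. This branch
    // existed before the patch.
    unsigned Scale = SrcEltBits / EltBits;
    return (Idx % Scale) == (ExpectedIdx % Scale) &&
           elementsMatch(Idx / Scale, ExpectedIdx / Scale);
  }
  if (EltBits % SrcEltBits == 0) {
    // Narrow source, wide result (e.g. v4i32 -> v2i64): each result element
    // covers Scale source elements, and every covered pair must match. This
    // is the branch the patch adds.
    unsigned Scale = EltBits / SrcEltBits;
    for (unsigned I = 0; I != Scale; ++I)
      if (!elementsMatch(Idx * Scale + I, ExpectedIdx * Scale + I))
        return false;
    return true;
  }
  return false; // Element sizes do not divide evenly; give up.
}

int main() {
  // v4i32 source bitcast to v2i64 result: result element 0 covers source
  // elements {0,1}, result element 1 covers {2,3}.
  printf("%d\n", bitcastElementsEquivalent(64, 32, 0, 0)); // 1 (same element)
  printf("%d\n", bitcastElementsEquivalent(64, 32, 0, 1)); // 0 (different source pairs)
  return 0;
}

In the real code the recursion goes through IsElementEquivalent on the source vector rather than a stub; the test diffs below show the practical effect, with several unpack/shuffle sequences relaxing into cheaper blends.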
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -532,10 +532,10 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT: callq __truncdfhf2@PLT
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm1 = mem[0],xmm0[1],mem[2,3]
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT: addq $40, %rsp
 ; AVX512-NEXT: .cfi_def_cfa_offset 8
 ; AVX512-NEXT: retq
15 changes: 7 additions & 8 deletions llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -179,8 +179,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
-; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm6 = xmm2[1],xmm6[1]
+; SSSE3-SLOW-NEXT: movaps %xmm6, %xmm1
 ; SSSE3-SLOW-NEXT: retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -345,8 +345,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
-; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
+; SSSE3-SLOW-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
 ; SSSE3-SLOW-NEXT: retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
@@ -374,7 +373,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -397,7 +396,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -422,7 +421,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
@@ -445,7 +444,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3138,10 +3138,10 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT: callq __truncdfhf2@PLT
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm1 = mem[0],xmm0[1],mem[2,3]
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT: addq $40, %rsp
 ; AVX512-NEXT: retq
 %1 = fptrunc <2 x double> %a0 to <2 x half>
@@ -3272,8 +3272,8 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT: callq __truncdfhf2@PLT
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT: addq $72, %rsp
 ; AVX512-NEXT: retq
 %1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -3404,8 +3404,8 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT: callq __truncdfhf2@PLT
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT: addq $72, %rsp
 ; AVX512-NEXT: retq
 %1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -4107,8 +4107,8 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT: callq __truncdfhf2@PLT
 ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT: vmovdqa %xmm0, (%rbx)
 ; AVX512-NEXT: addq $64, %rsp
 ; AVX512-NEXT: popq %rbx
18 changes: 6 additions & 12 deletions llvm/test/CodeGen/X86/vector-mul.ll
@@ -1569,25 +1569,19 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
 }
 
 define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
-; X86-SSE2-LABEL: mul_v2i64_0_1:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: xorpd %xmm1, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
Review discussion on this change:

Contributor: It seems movsd is better.

Collaborator (author): Are you just referring to the extra move?

Contributor: No, I mean movsd is better than unpckhpd.

Collaborator (author): I'm struggling to come up with a way the DAG can handle this cleanly - these are v2i64 shuffles that the domain pass flipped later on, much later than the TwoAddr commutation fixes. What's happened is that the i686 targets are now able to see the v2i64 zero (legalised to a v4i32 zero) and match it as a v2i64 shuffle instead of falling back to a v2f64 bitcasted shuffle. Unless we can demonstrate that MOVSD/S isn't affected by domain crossing, we don't have a good pass to fix this :(

Collaborator (author): To be clear, all that has happened is that the i686 codegen now matches the x86_64 codegen, which has always had this int/fp domain behavior.

Contributor: Ok, I think 32-bit performance is not so important.

-; X86-SSE2-NEXT: retl
+; SSE2-LABEL: mul_v2i64_0_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: ret{{[l|q]}}
 ;
 ; SSE4-LABEL: mul_v2i64_0_1:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: xorps %xmm1, %xmm1
 ; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; SSE4-NEXT: ret{{[l|q]}}
 ;
-; X64-SSE2-LABEL: mul_v2i64_0_1:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: xorps %xmm1, %xmm1
-; X64-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; X64-SSE2-NEXT: movaps %xmm1, %xmm0
-; X64-SSE2-NEXT: retq
-;
 ; X64-AVX-LABEL: mul_v2i64_0_1:
 ; X64-AVX: # %bb.0:
 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
34 changes: 23 additions & 11 deletions llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -7065,17 +7065,29 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
-; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
+; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias