
Commit 83fbe67

[X86] combineX86ShufflesRecursively - iteratively peek through bitcasts to free subvector widening/narrowing sources. (#134701)
Generalizes the existing code to repeatedly peek through mixed bitcast/insert_subvector/extract_subvector chains to find the source of the shuffle operand.
1 parent 8521bd2 commit 83fbe67
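
The X86ISelLowering.cpp change below folds what were separate one-shot peeks (insert_subvector widening, then bitcast, then extract_subvector) into a single loop that keeps stripping whichever wrapper it finds until no rule applies, so mixed chains are unwrapped completely. A minimal standalone sketch of that looping pattern follows; the Node type and Kind names are invented for illustration and are not LLVM's SDNode/SDValue API.

```cpp
#include <cstdio>

// Toy model of the "iteratively peek through wrappers" pattern: instead of
// handling bitcast, insert_subvector and extract_subvector style wrappers
// once each in a fixed order, loop over all the rules until none fires.
enum class Kind { Bitcast, InsertIntoUndef, ExtractLo, Leaf };

struct Node {
  Kind K;
  const Node *Src; // wrapped source, nullptr for a Leaf
};

// Strip any mix of wrappers and return the innermost source node.
static const Node *peekThroughWrappers(const Node *N) {
  while (true) {
    if (N->K == Kind::Bitcast) { N = N->Src; continue; }
    if (N->K == Kind::InsertIntoUndef) { N = N->Src; continue; }
    if (N->K == Kind::ExtractLo) { N = N->Src; continue; }
    break; // Leaf: nothing left to strip.
  }
  return N;
}

int main() {
  // extract(bitcast(insert(leaf))): a mixed chain that a single fixed-order
  // pass could leave partially wrapped, but the loop strips entirely.
  Node Leaf{Kind::Leaf, nullptr};
  Node Ins{Kind::InsertIntoUndef, &Leaf};
  Node BC{Kind::Bitcast, &Ins};
  Node Ext{Kind::ExtractLo, &BC};
  std::printf("reached leaf: %s\n",
              peekThroughWrappers(&Ext) == &Leaf ? "yes" : "no");
  return 0;
}
```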

11 files changed: +5212 -5460 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 22 deletions
@@ -41115,30 +41115,37 @@ static SDValue combineX86ShufflesRecursively(
       }
     }
 
+  // Peek through any free bitcasts to insert_subvector vector widenings or
+  // extract_subvector nodes back to root size.
+  // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
   for (auto [I, Op] : enumerate(Ops)) {
-    // Peek through vector widenings + set out of bounds mask indices to undef.
-    // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
-    if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
-        isNullConstant(Op.getOperand(2))) {
-      Op = Op.getOperand(1);
-      unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
-      int Lo = I * Mask.size();
-      int Hi = (I + 1) * Mask.size();
-      int NewHi = Lo + (Mask.size() / Scale);
-      for (int &M : Mask) {
-        if (Lo <= M && NewHi <= M && M < Hi)
-          M = SM_SentinelUndef;
-      }
-    }
-
-    // Peek through any free bitcasts/extract_subvector nodes back to root size.
     SDValue BC = Op;
-    if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse())
-      BC = peekThroughOneUseBitcasts(BC);
-    while (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-           (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
-           isNullConstant(BC.getOperand(1))) {
-      Op = BC = BC.getOperand(0);
+    while (1) {
+      if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
+        BC = BC.getOperand(0);
+        continue;
+      }
+      if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
+          BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
+        // Set out of bounds mask indices to undef.
+        Op = BC = BC.getOperand(1);
+        unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
+        int Lo = I * Mask.size();
+        int Hi = (I + 1) * Mask.size();
+        int NewHi = Lo + (Mask.size() / Scale);
+        for (int &M : Mask) {
+          if (Lo <= M && NewHi <= M && M < Hi)
+            M = SM_SentinelUndef;
+        }
+        continue;
+      }
+      if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+          (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
+          isNullConstant(BC.getOperand(1))) {
+        Op = BC = BC.getOperand(0);
+        continue;
+      }
+      break;
     }
   }

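For reference on the mask fix-up retained inside the new loop: when operand I is really a narrower vector widened by insert_subvector into undef, only the first Mask.size()/Scale lanes of that operand are defined, so mask indices in [Lo + Mask.size()/Scale, Hi) point at undef lanes and are replaced with the undef sentinel. A small self-contained sketch of just that arithmetic; the helper name and standalone setup are invented for illustration, and SM_SentinelUndef here is a local stand-in for LLVM's sentinel value.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

constexpr int SM_SentinelUndef = -1; // local stand-in for LLVM's sentinel

// Operand I occupies mask indices [I * Mask.size(), (I + 1) * Mask.size()).
// If it was widened from a vector that is RootSizeInBits / Scale bits wide,
// only the first Mask.size() / Scale of those indices are defined; mark the
// rest as undef.
static void undefOutOfBoundsLanes(std::vector<int> &Mask, unsigned I,
                                  unsigned RootSizeInBits,
                                  unsigned OpSizeInBits) {
  assert(RootSizeInBits % OpSizeInBits == 0 && "operand must divide root");
  unsigned Scale = RootSizeInBits / OpSizeInBits; // e.g. 256 / 128 = 2
  int Lo = I * Mask.size();                       // first index of operand I
  int Hi = (I + 1) * Mask.size();                 // one past its last index
  int NewHi = Lo + (Mask.size() / Scale);         // first undefined lane
  for (int &M : Mask)
    if (Lo <= M && NewHi <= M && M < Hi)
      M = SM_SentinelUndef;
}

int main() {
  // 4-element root mask referencing operand 0; the operand is only
  // half-width, so indices 2 and 3 point at undef lanes.
  std::vector<int> Mask = {0, 2, 1, 3};
  undefOutOfBoundsLanes(Mask, /*I=*/0, /*RootSizeInBits=*/256,
                        /*OpSizeInBits=*/128);
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 -1 1 -1
  std::printf("\n");
  return 0;
}
```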
llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 5 additions & 8 deletions
@@ -4708,18 +4708,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll

Lines changed: 8 additions & 8 deletions
@@ -1836,7 +1836,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1858,7 +1858,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -1914,7 +1914,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1936,7 +1936,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -1992,7 +1992,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2014,7 +2014,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -2070,7 +2070,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2092,7 +2092,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
