Skip to content

Commit 6153582

Browse files
committed
[X86] combineX86ShuffleChainWithExtract - peek through insert_subvector(undef,vec,0) widening patterns when tracking subvector sources
Helps replace a number of X86ISD::VPERMV3 nodes that are shuffling subvectors from the same source with X86ISD::VPERMV equivalents.
1 parent: d3532d1; commit: 6153582

File tree

5 files changed: 259 additions & 242 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 41 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -39710,13 +39710,23 @@ static SDValue combineX86ShuffleChainWithExtract(
3971039710
unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
3971139711
assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
3971239712

39713-
// Peek through extract_subvector to find widest legal vector.
39713+
// Peek through subvectors to find widest legal vector.
3971439714
// TODO: Handle ISD::TRUNCATE
3971539715
unsigned WideSizeInBits = RootSizeInBits;
39716-
for (unsigned I = 0; I != NumInputs; ++I) {
39717-
SDValue Input = peekThroughBitcasts(Inputs[I]);
39718-
while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
39719-
Input = peekThroughBitcasts(Input.getOperand(0));
39716+
for (SDValue Input : Inputs) {
39717+
Input = peekThroughBitcasts(Input);
39718+
while (1) {
39719+
if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
39720+
Input = peekThroughBitcasts(Input.getOperand(0));
39721+
continue;
39722+
}
39723+
if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
39724+
Input.getOperand(0).isUndef()) {
39725+
Input = peekThroughBitcasts(Input.getOperand(1));
39726+
continue;
39727+
}
39728+
break;
39729+
}
3972039730
if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
3972139731
WideSizeInBits < Input.getValueSizeInBits())
3972239732
WideSizeInBits = Input.getValueSizeInBits();
@@ -39744,21 +39754,32 @@ static SDValue combineX86ShuffleChainWithExtract(
3974439754
for (unsigned I = 0; I != NumInputs; ++I) {
3974539755
SDValue &Input = WideInputs[I];
3974639756
Input = peekThroughBitcasts(Input);
39747-
while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39748-
Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
39749-
uint64_t Idx = Input.getConstantOperandVal(1);
39750-
if (Idx != 0) {
39751-
++AdjustedMasks;
39752-
unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
39753-
Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
39754-
39755-
int lo = I * WideMask.size();
39756-
int hi = (I + 1) * WideMask.size();
39757-
for (int &M : WideMask)
39758-
if (lo <= M && M < hi)
39759-
M += Idx;
39760-
}
39761-
Input = peekThroughBitcasts(Input.getOperand(0));
39757+
while (1) {
39758+
if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39759+
Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
39760+
uint64_t Idx = Input.getConstantOperandVal(1);
39761+
if (Idx != 0) {
39762+
++AdjustedMasks;
39763+
unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
39764+
Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
39765+
39766+
int lo = I * WideMask.size();
39767+
int hi = (I + 1) * WideMask.size();
39768+
for (int &M : WideMask)
39769+
if (lo <= M && M < hi)
39770+
M += Idx;
39771+
}
39772+
Input = peekThroughBitcasts(Input.getOperand(0));
39773+
continue;
39774+
}
39775+
// TODO: Handle insertions into upper subvectors.
39776+
if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
39777+
Input.getOperand(0).isUndef() &&
39778+
isNullConstant(Input.getOperand(2))) {
39779+
Input = peekThroughBitcasts(Input.getOperand(1));
39780+
continue;
39781+
}
39782+
break;
3976239783
}
3976339784
}
3976439785

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2030,9 +2030,9 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
20302030
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
20312031
; AVX512BW-FAST: # %bb.0:
20322032
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
2033-
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31]
2033+
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
20342034
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2035-
; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm0
2035+
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
20362036
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
20372037
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
20382038
; AVX512BW-FAST-NEXT: vzeroupper

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 14 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2221,11 +2221,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i
22212221
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
22222222
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
22232223
; CHECK: # %bb.0:
2224-
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2225-
; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,0,3,4]
2226-
; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
2224+
; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4]
2225+
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
22272226
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2228-
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2227+
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
22292228
; CHECK-NEXT: retq
22302229
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
22312230
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2235,10 +2234,9 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
22352234
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
22362235
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
22372236
; CHECK: # %bb.0:
2238-
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2
2239-
; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4]
2237+
; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,0,3,4]
22402238
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2241-
; CHECK-NEXT: vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z}
2239+
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
22422240
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
22432241
; CHECK-NEXT: retq
22442242
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
@@ -4092,8 +4090,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
40924090
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
40934091
; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
40944092
; CHECK-FAST: # %bb.0:
4095-
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8]
4096-
; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0
4093+
; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [5,0,7,0]
4094+
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
40974095
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
40984096
; CHECK-FAST-NEXT: retq
40994097
;
@@ -4109,11 +4107,11 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
41094107
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
41104108
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
41114109
; CHECK-FAST: # %bb.0:
4112-
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8]
4113-
; CHECK-FAST-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3
4114-
; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
4115-
; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4116-
; CHECK-FAST-NEXT: vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
4110+
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,0,7,0]
4111+
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4112+
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4113+
; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4114+
; CHECK-FAST-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
41174115
; CHECK-FAST-NEXT: retq
41184116
;
41194117
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
@@ -4134,10 +4132,10 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
41344132
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
41354133
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
41364134
; CHECK-FAST: # %bb.0:
4137-
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,8,7,8]
4135+
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,0,7,0]
41384136
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
41394137
; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4140-
; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z}
4138+
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
41414139
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
41424140
; CHECK-FAST-NEXT: retq
41434141
;

0 commit comments

Comments
 (0)