Skip to content

Commit 505d35a

Browse files
authored
[X86] getFauxShuffleMask - relax one use limit for insert_subvector concat splat pattern (llvm#127981)
If we're splatting a subvector using an insert_subvector(insert_subvector(undef,sub,0),sub,c) pattern, then permit multiple uses of the sub as long as the insert_subvector nodes are the only users.
1 parent 92a3192 commit 505d35a

File tree

2 files changed

+25
-22
lines changed

2 files changed

+25
-22
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6128,10 +6128,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
61286128
SDValue Sub = N.getOperand(1);
61296129
EVT SubVT = Sub.getValueType();
61306130
unsigned NumSubElts = SubVT.getVectorNumElements();
6131+
uint64_t InsertIdx = N.getConstantOperandVal(2);
6132+
// Handle CONCAT(SUB0, SUB1).
6133+
// Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6134+
if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6135+
NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6136+
Src.getOperand(0).isUndef() &&
6137+
Src.getOperand(1).getValueType() == SubVT &&
6138+
Src.getConstantOperandVal(2) == 0 &&
6139+
SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6140+
for (int i = 0; i != (int)NumSubElts; ++i)
6141+
Mask.push_back(i);
6142+
for (int i = 0; i != (int)NumSubElts; ++i)
6143+
Mask.push_back(i + NumElts);
6144+
Ops.push_back(Src.getOperand(1));
6145+
Ops.push_back(Sub);
6146+
return true;
6147+
}
61316148
if (!N->isOnlyUserOf(Sub.getNode()))
61326149
return false;
61336150
SDValue SubBC = peekThroughBitcasts(Sub);
6134-
uint64_t InsertIdx = N.getConstantOperandVal(2);
61356151
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
61366152
if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
61376153
SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
@@ -6154,21 +6170,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
61546170
Ops.push_back(SubBCSrc);
61556171
return true;
61566172
}
6157-
// Handle CONCAT(SUB0, SUB1).
6158-
// Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6159-
if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6160-
NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6161-
Src.getOperand(0).isUndef() &&
6162-
Src.getOperand(1).getValueType() == SubVT &&
6163-
Src.getConstantOperandVal(2) == 0) {
6164-
for (int i = 0; i != (int)NumSubElts; ++i)
6165-
Mask.push_back(i);
6166-
for (int i = 0; i != (int)NumSubElts; ++i)
6167-
Mask.push_back(i + NumElts);
6168-
Ops.push_back(Src.getOperand(1));
6169-
Ops.push_back(Sub);
6170-
return true;
6171-
}
61726173
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
61736174
SmallVector<int, 64> SubMask;
61746175
SmallVector<SDValue, 2> SubInputs;

llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6897,9 +6897,10 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i
68976897
; AVX512BW: # %bb.0:
68986898
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
68996899
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6900-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6901-
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
6902-
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6900+
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
6901+
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,4,21,22,23,0,25,26,27,4,29,30,31]
6902+
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
6903+
; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
69036904
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
69046905
; AVX512BW-NEXT: vzeroupper
69056906
; AVX512BW-NEXT: retq
@@ -7098,9 +7099,10 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
70987099
; AVX512BW: # %bb.0:
70997100
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
71007101
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7101-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
7102-
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
7103-
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7102+
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
7103+
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
7104+
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
7105+
; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
71047106
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
71057107
; AVX512BW-NEXT: vzeroupper
71067108
; AVX512BW-NEXT: retq

0 commit comments

Comments
 (0)