Skip to content

Commit 7457f51

Browse files
authored
[X86] Fold VPERMV3(X,M,Y) -> VPERMV(CONCAT(X,Y),WIDEN(M)) iff the CONCAT is free (#122485)
This extends the existing fold that concatenates X and Y when they are sequential subvectors extracted from the same source. By using combineConcatVectorOps, we can recognise other patterns where X and Y can be concatenated for free (e.g. sequential loads, concatenation of repeated instructions, etc.), which allows the VPERMV3 fold to be much more aggressive. This required extending combineConcatVectorOps to handle the additional case of "concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> extract_subvector(x)", similar to the original VPERMV3 fold, where "x" is larger than the concat result type. This change also exposes more cases where we end up with repeated vector/subvector loads when a value has multiple uses — e.g. loading a ymm while also loading its lo/hi xmm halves independently. In the past we have considered this relatively benign, but it is worth asking whether we should now do more to keep such loads from splitting.
1 parent 019a902 commit 7457f51

22 files changed

+1919
-1972
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -41701,6 +41701,11 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
4170141701
return SDValue();
4170241702
}
4170341703

41704+
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
41705+
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
41706+
TargetLowering::DAGCombinerInfo &DCI,
41707+
const X86Subtarget &Subtarget);
41708+
4170441709
/// Try to combine x86 target specific shuffles.
4170541710
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4170641711
SelectionDAG &DAG,
@@ -42401,32 +42406,27 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4240142406
return SDValue();
4240242407
}
4240342408
case X86ISD::VPERMV3: {
42404-
SDValue V1 = peekThroughBitcasts(N.getOperand(0));
42405-
SDValue V2 = peekThroughBitcasts(N.getOperand(2));
42406-
MVT SVT = V1.getSimpleValueType();
42407-
// Combine VPERMV3 to widened VPERMV if the two source operands are split
42408-
// from the same vector.
42409-
if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42410-
V1.getConstantOperandVal(1) == 0 &&
42411-
V2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42412-
V2.getConstantOperandVal(1) == SVT.getVectorNumElements() &&
42413-
V1.getOperand(0) == V2.getOperand(0)) {
42414-
EVT NVT = V1.getOperand(0).getValueType();
42415-
if (NVT.is256BitVector() ||
42416-
(NVT.is512BitVector() && Subtarget.hasEVEX512())) {
42417-
MVT WideVT = MVT::getVectorVT(
42418-
VT.getScalarType(), NVT.getSizeInBits() / VT.getScalarSizeInBits());
42409+
// Combine VPERMV3 to widened VPERMV if the two source operands can be
42410+
// freely concatenated.
42411+
if (VT.is128BitVector() ||
42412+
(VT.is256BitVector() && Subtarget.useAVX512Regs())) {
42413+
SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
42414+
MVT WideVT = VT.getDoubleNumVectorElementsVT();
42415+
if (SDValue ConcatSrc =
42416+
combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) {
4241942417
SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
4242042418
DL, WideVT.getSizeInBits());
42421-
SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask,
42422-
DAG.getBitcast(WideVT, V1.getOperand(0)));
42419+
SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
4242342420
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
4242442421
DAG.getIntPtrConstant(0, DL));
4242542422
}
4242642423
}
4242742424
SmallVector<SDValue, 2> Ops;
4242842425
SmallVector<int, 32> Mask;
4242942426
if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42427+
assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42428+
SDValue V1 = peekThroughBitcasts(N.getOperand(0));
42429+
SDValue V2 = peekThroughBitcasts(N.getOperand(2));
4243042430
MVT MaskVT = N.getOperand(1).getSimpleValueType();
4243142431
// Canonicalize to VPERMV if both sources are the same.
4243242432
if (V1 == V2) {
@@ -57369,10 +57369,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5736957369
Op0.getOperand(1));
5737057370
}
5737157371

57372-
// concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
57373-
// Only concat of subvector high halves which vperm2x128 is best at.
5737457372
// TODO: This should go in combineX86ShufflesRecursively eventually.
57375-
if (VT.is256BitVector() && NumOps == 2) {
57373+
if (NumOps == 2) {
5737657374
SDValue Src0 = peekThroughBitcasts(Ops[0]);
5737757375
SDValue Src1 = peekThroughBitcasts(Ops[1]);
5737857376
if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
@@ -57381,14 +57379,25 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5738157379
EVT SrcVT1 = Src1.getOperand(0).getValueType();
5738257380
unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
5738357381
unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
57384-
if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
57382+
// concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
57383+
// Only concat of subvector high halves which vperm2x128 is best at.
57384+
if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
57385+
SrcVT1.is256BitVector() &&
5738557386
Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
5738657387
Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
5738757388
return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
5738857389
DAG.getBitcast(VT, Src0.getOperand(0)),
5738957390
DAG.getBitcast(VT, Src1.getOperand(0)),
5739057391
DAG.getTargetConstant(0x31, DL, MVT::i8));
5739157392
}
57393+
// concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x.
57394+
if (Src0.getOperand(0) == Src1.getOperand(0) &&
57395+
Src0.getConstantOperandAPInt(1) == 0 &&
57396+
Src1.getConstantOperandAPInt(1) ==
57397+
Src0.getValueType().getVectorNumElements()) {
57398+
return DAG.getBitcast(VT, extractSubVector(Src0.getOperand(0), 0, DAG,
57399+
DL, VT.getSizeInBits()));
57400+
}
5739257401
}
5739357402
}
5739457403

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1337,10 +1337,9 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.
13371337
;
13381338
; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
13391339
; AVX512BW: # %bb.0:
1340-
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1341-
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
1342-
; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
1343-
; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
1340+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
1341+
; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
1342+
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
13441343
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
13451344
; AVX512BW-NEXT: vzeroupper
13461345
; AVX512BW-NEXT: retq
@@ -1789,10 +1788,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
17891788
;
17901789
; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
17911790
; AVX512F-FAST: # %bb.0:
1792-
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
1793-
; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
1794-
; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1
1795-
; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
1791+
; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7]
1792+
; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0
1793+
; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
17961794
; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
17971795
; AVX512F-FAST-NEXT: vzeroupper
17981796
; AVX512F-FAST-NEXT: retq
@@ -1808,10 +1806,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
18081806
;
18091807
; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
18101808
; AVX512DQ-FAST: # %bb.0:
1811-
; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
1812-
; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
1813-
; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1
1814-
; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
1809+
; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7]
1810+
; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0
1811+
; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
18151812
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx)
18161813
; AVX512DQ-FAST-NEXT: vzeroupper
18171814
; AVX512DQ-FAST-NEXT: retq
@@ -1827,10 +1824,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
18271824
;
18281825
; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
18291826
; AVX512BW-FAST: # %bb.0:
1830-
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
1831-
; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
1832-
; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1
1833-
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0
1827+
; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7]
1828+
; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0
1829+
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
18341830
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
18351831
; AVX512BW-FAST-NEXT: vzeroupper
18361832
; AVX512BW-FAST-NEXT: retq

0 commit comments

Comments
 (0)