Skip to content

Commit 42ff31a

Browse files
authored
[X86] combineTargetShuffle - fold VPERMV3(HI,MASK,LO) -> VPERMV(COMMUTE(MASK),CONCAT(LO,HI)) (#127199)
We already handle the simpler VPERMV3(LO,MASK,HI) fold, which can reuse the (widened) mask. This patch attempts to match the flipped concatenation and commutes the mask to handle the flip. I've limited this to cases where we can extract the constant mask for commutation; a more general solution would XOR the MSB of the shuffle-mask indices to commute, but that almost never constant-folds away after lowering, so the benefit was minimal.
1 parent 2db2628 commit 42ff31a

9 files changed

+439
-468
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42513,10 +42513,12 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4251342513
case X86ISD::VPERMV3: {
4251442514
// Combine VPERMV3 to widened VPERMV if the two source operands can be
4251542515
// freely concatenated.
42516-
if (VT.is128BitVector() ||
42517-
(VT.is256BitVector() && Subtarget.useAVX512Regs())) {
42516+
MVT WideVT = VT.getDoubleNumVectorElementsVT();
42517+
MVT MaskVT = N.getOperand(1).getSimpleValueType();
42518+
bool CanConcat = VT.is128BitVector() ||
42519+
(VT.is256BitVector() && Subtarget.useAVX512Regs());
42520+
if (CanConcat) {
4251842521
SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
42519-
MVT WideVT = VT.getDoubleNumVectorElementsVT();
4252042522
if (SDValue ConcatSrc =
4252142523
combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) {
4252242524
SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
@@ -42530,9 +42532,24 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4253042532
SmallVector<int, 32> Mask;
4253142533
if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
4253242534
assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42535+
// See if we can concatenate the commuted operands.
42536+
if (CanConcat) {
42537+
if (SDValue ConcatSrc = combineConcatVectorOps(
42538+
DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG, DCI,
42539+
Subtarget)) {
42540+
ShuffleVectorSDNode::commuteMask(Mask);
42541+
SDValue NewMask =
42542+
getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true);
42543+
NewMask = widenSubVector(NewMask, false, Subtarget, DAG, DL,
42544+
WideVT.getSizeInBits());
42545+
SDValue Perm =
42546+
DAG.getNode(X86ISD::VPERMV, DL, WideVT, NewMask, ConcatSrc);
42547+
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
42548+
DAG.getVectorIdxConstant(0, DL));
42549+
}
42550+
}
4253342551
SDValue V1 = peekThroughBitcasts(N.getOperand(0));
4253442552
SDValue V2 = peekThroughBitcasts(N.getOperand(2));
42535-
MVT MaskVT = N.getOperand(1).getSimpleValueType();
4253642553
// Canonicalize to VPERMV if both sources are the same.
4253742554
if (V1 == V2) {
4253842555
for (int &M : Mask)

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1754,10 +1754,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
17541754
; AVX512BW: # %bb.0:
17551755
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
17561756
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1757-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1758-
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
1759-
; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
1760-
; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
1757+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
1758+
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
1759+
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
17611760
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
17621761
; AVX512BW-NEXT: vzeroupper
17631762
; AVX512BW-NEXT: retq
@@ -1870,10 +1869,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
18701869
; AVX512BW: # %bb.0:
18711870
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
18721871
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1873-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1874-
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
1875-
; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
1876-
; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
1872+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
1873+
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
1874+
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
18771875
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
18781876
; AVX512BW-NEXT: vzeroupper
18791877
; AVX512BW-NEXT: retq
@@ -3776,12 +3774,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37763774
; AVX512BW: # %bb.0:
37773775
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
37783776
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3779-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3780-
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15]
3781-
; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
3782-
; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
3777+
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31]
3778+
; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
3779+
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
37833780
; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
3784-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
3781+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
37853782
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
37863783
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
37873784
; AVX512BW-NEXT: vzeroupper
@@ -3911,11 +3908,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
39113908
; AVX512BW: # %bb.0:
39123909
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
39133910
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3914-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3915-
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0]
3916-
; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
3911+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,0,0,0,0,0,0,0,0]
3912+
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
39173913
; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
3918-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
3914+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
39193915
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
39203916
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
39213917
; AVX512BW-NEXT: vzeroupper
@@ -4037,11 +4033,10 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
40374033
; AVX512BW: # %bb.0:
40384034
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
40394035
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4040-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4041-
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15]
4042-
; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
4043-
; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
4044-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
4036+
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31]
4037+
; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
4038+
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
4039+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
40454040
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
40464041
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
40474042
; AVX512BW-NEXT: vzeroupper
@@ -4151,10 +4146,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
41514146
; AVX512BW: # %bb.0:
41524147
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
41534148
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4154-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4155-
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0]
4156-
; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
4157-
; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
4149+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,0,0,0,0,0,0,0]
4150+
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
4151+
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
41584152
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
41594153
; AVX512BW-NEXT: vzeroupper
41604154
; AVX512BW-NEXT: retq

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1415,10 +1415,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
14151415
;
14161416
; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
14171417
; AVX512BW: # %bb.0:
1418-
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
1419-
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
1420-
; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1
1421-
; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
1418+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
1419+
; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
1420+
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
14221421
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
14231422
; AVX512BW-NEXT: vzeroupper
14241423
; AVX512BW-NEXT: retq
@@ -1502,10 +1501,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
15021501
;
15031502
; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
15041503
; AVX512BW: # %bb.0:
1505-
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
1506-
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
1507-
; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1
1508-
; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
1504+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
1505+
; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
1506+
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
15091507
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
15101508
; AVX512BW-NEXT: vzeroupper
15111509
; AVX512BW-NEXT: retq

0 commit comments

Comments (0)