Skip to content

Commit 70bd80d

Browse files
committed
[X86] combineTargetShuffle - commute VPERMV3 shuffles so any load is on the RHS
This helps ensure we lower to VPERMI2/VPERMT2 instructions whose index argument we can commute (converting VPERMI2 to VPERMT2 and vice versa). This is similar to 1e31a45, but handles cases where the one-use load only appears after further folding (the lowerShuffleWithPERMV version is kept, as it can also handle the non-VLX widening case).
1 parent aa2d084 commit 70bd80d

File tree

3 files changed

+21
-13
lines changed

3 files changed

+21
-13
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42252,6 +42252,17 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4225242252
DAG.getIntPtrConstant(0, DL));
4225342253
}
4225442254
}
42255+
SmallVector<SDValue, 2> Ops;
42256+
SmallVector<int, 32> Mask;
42257+
if (isShuffleFoldableLoad(N.getOperand(0)) &&
42258+
!isShuffleFoldableLoad(N.getOperand(2)) &&
42259+
getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42260+
ShuffleVectorSDNode::commuteMask(Mask);
42261+
SDValue NewMask = getConstVector(
42262+
Mask, N.getOperand(1).getSimpleValueType(), DAG, DL, /*IsMask=*/true);
42263+
return DAG.getNode(X86ISD::VPERMV3, DL, VT, N.getOperand(2), NewMask,
42264+
N.getOperand(0));
42265+
}
4225542266
return SDValue();
4225642267
}
4225742268
default:

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3514,12 +3514,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x
35143514
; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
35153515
; CHECK-FAST-PERLANE: # %bb.0:
35163516
; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3517-
; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3518-
; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1]
3519-
; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
3517+
; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9]
3518+
; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
35203519
; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2
35213520
; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3522-
; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1}
3521+
; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm0 {%k1}
35233522
; CHECK-FAST-PERLANE-NEXT: retq
35243523
%vec = load <16 x float>, ptr %vp
35253524
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
@@ -3542,11 +3541,10 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8
35423541
; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
35433542
; CHECK-FAST-PERLANE: # %bb.0:
35443543
; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3545-
; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3546-
; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3547-
; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
3548-
; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1
3549-
; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
3544+
; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9]
3545+
; CHECK-FAST-PERLANE-NEXT: vxorps %xmm3, %xmm3, %xmm3
3546+
; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3547+
; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
35503548
; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0
35513549
; CHECK-FAST-PERLANE-NEXT: retq
35523550
%vec = load <16 x float>, ptr %vp

llvm/test/CodeGen/X86/vector-shuffle-v1.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -719,10 +719,9 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
719719
; AVX512F: # %bb.0:
720720
; AVX512F-NEXT: kmovw %edi, %k1
721721
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
722-
; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
723-
; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
724-
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
725-
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
722+
; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,14,9,8,11,15,15,9]
723+
; AVX512F-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
724+
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
726725
; AVX512F-NEXT: kmovw %k0, %eax
727726
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
728727
; AVX512F-NEXT: vzeroupper

0 commit comments

Comments
 (0)