Skip to content

Commit a03f064

Browse files
committed
[X86] combineX86ShufflesRecursively - peek through one use bitcasts to find additional (free) extract_subvector nodes
1 parent f00b32e commit a03f064

File tree

3 files changed

+34
-54
lines changed

3 files changed

+34
-54
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41084,12 +41084,17 @@ static SDValue combineX86ShufflesRecursively(
4108441084
}
4108541085
}
4108641086

41087-
// Peek through any free extract_subvector nodes back to root size.
41088-
for (SDValue &Op : Ops)
41089-
while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41090-
(RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41091-
isNullConstant(Op.getOperand(1)))
41092-
Op = Op.getOperand(0);
41087+
// Peek through any free bitcasts/extract_subvector nodes back to root size.
41088+
for (SDValue &Op : Ops) {
41089+
SDValue BC = Op;
41090+
if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse())
41091+
BC = peekThroughOneUseBitcasts(BC);
41092+
while (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41093+
(RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41094+
isNullConstant(BC.getOperand(1))) {
41095+
Op = BC = BC.getOperand(0);
41096+
}
41097+
}
4109341098

4109441099
// Remove unused/repeated shuffle source ops.
4109541100
resolveTargetShuffleInputsAndMask(Ops, Mask);

llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll

Lines changed: 19 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12867,46 +12867,25 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
1286712867
; AVX512DQ-NEXT: vzeroupper
1286812868
; AVX512DQ-NEXT: retq
1286912869
;
12870-
; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
12871-
; AVX512BW-ONLY: # %bb.0:
12872-
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
12873-
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
12874-
; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0
12875-
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
12876-
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
12877-
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
12878-
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
12879-
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
12880-
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
12881-
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
12882-
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
12883-
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
12884-
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
12885-
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
12886-
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
12887-
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
12888-
; AVX512BW-ONLY-NEXT: vzeroupper
12889-
; AVX512BW-ONLY-NEXT: retq
12890-
;
12891-
; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
12892-
; AVX512VBMI-ONLY: # %bb.0:
12893-
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
12894-
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
12895-
; AVX512VBMI-ONLY-NEXT: vpmovsxbq %xmm0, %zmm0
12896-
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
12897-
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
12898-
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
12899-
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
12900-
; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
12901-
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
12902-
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
12903-
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
12904-
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
12905-
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
12906-
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
12907-
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
12908-
; AVX512VBMI-ONLY-NEXT: vzeroupper
12909-
; AVX512VBMI-ONLY-NEXT: retq
12870+
; AVX512BW-LABEL: mask_replication_factor8_vf8:
12871+
; AVX512BW: # %bb.0:
12872+
; AVX512BW-NEXT: kmovq (%rdi), %k0
12873+
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
12874+
; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0
12875+
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
12876+
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
12877+
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
12878+
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
12879+
; AVX512BW-NEXT: kshiftrq $48, %k1, %k2
12880+
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
12881+
; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
12882+
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
12883+
; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
12884+
; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
12885+
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
12886+
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
12887+
; AVX512BW-NEXT: vzeroupper
12888+
; AVX512BW-NEXT: retq
1291012889
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
1291112890
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1291212891
%tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>

llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5117,15 +5117,11 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
51175117
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
51185118
; AVX512BW-FAST: # %bb.0:
51195119
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
5120-
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
51215120
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5122-
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
5123-
; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
5124-
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5125-
; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
5126-
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5127-
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
5128-
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5121+
; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
5122+
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
5123+
; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
5124+
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
51295125
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
51305126
; AVX512BW-FAST-NEXT: vzeroupper
51315127
; AVX512BW-FAST-NEXT: retq

0 commit comments

Comments
 (0)