Commit b15e7f3

[X86] combineINSERT_SUBVECTOR - attempt to recursively shuffle combine if both base/sub-vectors are already shuffles (#130304)
1 parent 0377562 commit b15e7f3

3 files changed: +45, -72 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
@@ -58758,6 +58758,14 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Attempt to recursively combine to a shuffle.
+  if (isTargetShuffle(peekThroughBitcasts(Vec).getOpcode()) &&
+      isTargetShuffle(peekThroughBitcasts(SubVec).getOpcode())) {
+    SDValue Op(N, 0);
+    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+      return Res;
+  }
+
   return SDValue();
 }
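
For context, a minimal hypothetical sketch of the shape of pattern this combine targets (the function name, vector type, and shuffle mask below are illustrative only and are not taken from the tests in this commit): when lowering produces an INSERT_SUBVECTOR whose base vector and inserted subvector are both already target shuffles, combineX86ShufflesRecursively can now fold the whole tree into a single wide permute instead of leaving a permute + broadcast + vector-insert sequence.

; Hypothetical example only: a <32 x i16> shuffle that interleaves element 0
; with high elements in the low lanes and broadcasts element 0 into the rest.
; Shuffles of this general shape could previously lower on AVX512BW to
; vpermw + vpbroadcastw + vinserti64x4; with the recursive combine they can
; collapse into a single vpermw with a widened mask.
define <32 x i16> @interleave_and_broadcast(<32 x i16> %v) {
  %r = shufflevector <32 x i16> %v, <32 x i16> poison,
       <32 x i32> <i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31,
                   i32 0, i32 0,  i32 0, i32 0,  i32 0, i32 0,  i32 0, i32 0,
                   i32 0, i32 0,  i32 0, i32 0,  i32 0, i32 0,  i32 0, i32 0,
                   i32 0, i32 0,  i32 0, i32 0,  i32 0, i32 0,  i32 0, i32 0>
  ret <32 x i16> %r
}

The test updates below show the same effect: three-instruction permute/broadcast/insert sequences are replaced by a single vpermw, vpermd, or vpermi2* with a widened mask.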

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 6 additions & 16 deletions
@@ -3519,9 +3519,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
@@ -3651,9 +3649,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
@@ -3776,9 +3772,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31]
 ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
@@ -3909,9 +3903,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,0,31]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
@@ -4401,10 +4393,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-SLOW: # %bb.0:
 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,14,0,0,0,0,0,0,0,1,1,0,0,0,0]
+; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT: vzeroupper

llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll

Lines changed: 31 additions & 56 deletions
@@ -3652,13 +3652,11 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
-; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -3857,13 +3855,11 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -4085,13 +4081,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -4292,13 +4286,11 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
 ; AVX512BW-FAST: # %bb.0:
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47,48,49,0,51,52,53,54,55,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -5101,32 +5093,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
-; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5381,13 +5358,11 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
 ; AVX512BW-SLOW: # %bb.0:
 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11]
 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
+; AVX512BW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT: vzeroupper
 ; AVX512BW-SLOW-NEXT: retq
