Skip to content

Commit 8aacbfc

Browse files
committed
[X86] combineEXTRACT_SUBVECTOR - treat one-use extractions from loads as free
Allows further reductions in instruction vector widths.
1 parent 0eb1fc8 commit 8aacbfc

File tree

6 files changed

+395
-397
lines changed

6 files changed

+395
-397
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57140,6 +57140,11 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
5714057140
}
5714157141

5714257142
auto IsExtractFree = [](SDValue V) {
57143+
if (V.hasOneUse()) {
57144+
V = peekThroughOneUseBitcasts(V);
57145+
if (V.getOpcode() == ISD::LOAD)
57146+
return true;
57147+
}
5714357148
V = peekThroughBitcasts(V);
5714457149
if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
5714557150
return true;

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2749,12 +2749,9 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
27492749
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
27502750
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
27512751
; CHECK: # %bb.0:
2752-
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2753-
; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2754-
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
2752+
; CHECK-NEXT: vmovdqa 48(%rdi), %xmm2
27552753
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2756-
; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2757-
; CHECK-NEXT: vzeroupper
2754+
; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
27582755
; CHECK-NEXT: retq
27592756
%vec = load <8 x i64>, ptr %vp
27602757
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2766,12 +2763,9 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
27662763
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
27672764
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
27682765
; CHECK: # %bb.0:
2769-
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
2770-
; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2771-
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
2766+
; CHECK-NEXT: vmovdqa 48(%rdi), %xmm1
27722767
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2773-
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2774-
; CHECK-NEXT: vzeroupper
2768+
; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
27752769
; CHECK-NEXT: retq
27762770
%vec = load <8 x i64>, ptr %vp
27772771
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 60 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -2401,109 +2401,106 @@ define void @D107009(ptr %input, ptr %output) {
24012401
; AVX1-LABEL: D107009:
24022402
; AVX1: # %bb.0:
24032403
; AVX1-NEXT: vmovups 96(%rdi), %ymm0
2404-
; AVX1-NEXT: vmovups (%rdi), %ymm1
2405-
; AVX1-NEXT: vmovups 128(%rdi), %ymm2
2406-
; AVX1-NEXT: vmovups 224(%rdi), %ymm3
2407-
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
2408-
; AVX1-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
2409-
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4]
2404+
; AVX1-NEXT: vmovups 128(%rdi), %ymm1
2405+
; AVX1-NEXT: vmovups 224(%rdi), %ymm2
2406+
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
24102407
; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2411-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2408+
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
2409+
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
2410+
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
24122411
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
24132412
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
24142413
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2415-
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2414+
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
24162415
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
2417-
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
2416+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
24182417
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
24192418
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
24202419
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
24212420
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
24222421
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2423-
; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
2424-
; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
2425-
; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
2426-
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
2427-
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
2422+
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7]
2423+
; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2]
2424+
; AVX1-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
2425+
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
2426+
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
24282427
; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
2429-
; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi)
2430-
; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi)
2428+
; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi)
2429+
; AVX1-NEXT: vmovdqa %xmm6, 112(%rsi)
24312430
; AVX1-NEXT: vmovups %ymm1, 128(%rsi)
2432-
; AVX1-NEXT: vmovupd %ymm5, 192(%rsi)
2433-
; AVX1-NEXT: vmovups %ymm4, 224(%rsi)
2434-
; AVX1-NEXT: vmovups %ymm3, 160(%rsi)
2431+
; AVX1-NEXT: vmovups %ymm5, 160(%rsi)
2432+
; AVX1-NEXT: vmovupd %ymm4, 192(%rsi)
2433+
; AVX1-NEXT: vmovupd %ymm3, 224(%rsi)
24352434
; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
24362435
; AVX1-NEXT: vzeroupper
24372436
; AVX1-NEXT: retq
24382437
;
24392438
; AVX2-LABEL: D107009:
24402439
; AVX2: # %bb.0:
2441-
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
2442-
; AVX2-NEXT: vmovdqu 64(%rdi), %ymm1
2443-
; AVX2-NEXT: vmovdqu 128(%rdi), %ymm2
2444-
; AVX2-NEXT: vmovdqu 192(%rdi), %ymm3
2445-
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5]
2440+
; AVX2-NEXT: vmovdqu 64(%rdi), %ymm0
2441+
; AVX2-NEXT: vmovdqu 128(%rdi), %ymm1
2442+
; AVX2-NEXT: vmovdqu 192(%rdi), %ymm2
24462443
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
2447-
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2448-
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
2449-
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
24502444
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2451-
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
2452-
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2453-
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2445+
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
2446+
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
2447+
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
2448+
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
2449+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
2450+
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
2451+
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
24542452
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
2455-
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2456-
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
2457-
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
2458-
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2459-
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,1,1,1,5,5,5,5]
2460-
; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[3,3,3,3,7,7,7,7]
2461-
; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7]
2453+
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2454+
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
2455+
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
2456+
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm4
2457+
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7]
2458+
; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7]
2459+
; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,1,1,1,5,5,5,5]
24622460
; AVX2-NEXT: vmovdqu %ymm0, 128(%rsi)
2463-
; AVX2-NEXT: vmovdqu %ymm7, 192(%rsi)
2464-
; AVX2-NEXT: vmovdqu %ymm6, 224(%rsi)
2465-
; AVX2-NEXT: vmovdqu %ymm5, 160(%rsi)
2466-
; AVX2-NEXT: vmovdqu %ymm4, 64(%rsi)
2467-
; AVX2-NEXT: vmovdqa %xmm3, 112(%rsi)
2468-
; AVX2-NEXT: vmovdqu %ymm2, (%rsi)
2469-
; AVX2-NEXT: vmovdqa %xmm1, 48(%rsi)
2461+
; AVX2-NEXT: vmovdqu %ymm7, 160(%rsi)
2462+
; AVX2-NEXT: vmovdqu %ymm6, 192(%rsi)
2463+
; AVX2-NEXT: vmovdqu %ymm5, 224(%rsi)
2464+
; AVX2-NEXT: vmovdqu %ymm4, (%rsi)
2465+
; AVX2-NEXT: vmovdqa %xmm3, 48(%rsi)
2466+
; AVX2-NEXT: vmovdqa %xmm2, 112(%rsi)
2467+
; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi)
24702468
; AVX2-NEXT: vzeroupper
24712469
; AVX2-NEXT: retq
24722470
;
24732471
; XOP-LABEL: D107009:
24742472
; XOP: # %bb.0:
24752473
; XOP-NEXT: vmovups 96(%rdi), %ymm0
2476-
; XOP-NEXT: vmovups (%rdi), %ymm1
2477-
; XOP-NEXT: vmovups 128(%rdi), %ymm2
2478-
; XOP-NEXT: vmovups 224(%rdi), %ymm3
2479-
; XOP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
2480-
; XOP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
2481-
; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4]
2474+
; XOP-NEXT: vmovups 128(%rdi), %ymm1
2475+
; XOP-NEXT: vmovups 224(%rdi), %ymm2
2476+
; XOP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
24822477
; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2483-
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
2478+
; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
2479+
; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
2480+
; XOP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
24842481
; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
24852482
; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
24862483
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0
2487-
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2484+
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
24882485
; XOP-NEXT: vpsrld $16, %xmm0, %xmm0
2489-
; XOP-NEXT: vextractf128 $1, %ymm2, %xmm1
2486+
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
24902487
; XOP-NEXT: vpsrld $16, %xmm1, %xmm1
24912488
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
24922489
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
24932490
; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
24942491
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2495-
; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
2496-
; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
2497-
; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
2498-
; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
2499-
; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
2492+
; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7]
2493+
; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2]
2494+
; XOP-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
2495+
; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
2496+
; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
25002497
; XOP-NEXT: vmovdqa %xmm0, 16(%rsi)
2501-
; XOP-NEXT: vmovdqa %xmm7, 112(%rsi)
2502-
; XOP-NEXT: vmovdqa %xmm6, 48(%rsi)
2498+
; XOP-NEXT: vmovdqa %xmm7, 48(%rsi)
2499+
; XOP-NEXT: vmovdqa %xmm6, 112(%rsi)
25032500
; XOP-NEXT: vmovups %ymm1, 128(%rsi)
2504-
; XOP-NEXT: vmovupd %ymm5, 192(%rsi)
2505-
; XOP-NEXT: vmovups %ymm4, 224(%rsi)
2506-
; XOP-NEXT: vmovups %ymm3, 160(%rsi)
2501+
; XOP-NEXT: vmovups %ymm5, 160(%rsi)
2502+
; XOP-NEXT: vmovupd %ymm4, 192(%rsi)
2503+
; XOP-NEXT: vmovupd %ymm3, 224(%rsi)
25072504
; XOP-NEXT: vmovups %ymm2, 64(%rsi)
25082505
; XOP-NEXT: vzeroupper
25092506
; XOP-NEXT: retq

0 commit comments

Comments
 (0)