Skip to content

Commit d7f3775

Browse files
committed
[X86] combineEXTRACT_SUBVECTOR - fold extract_subvector(pshufd(v,i)) -> pshufd(extract_subvector(v,i))
Attempt to avoid unnecessary wide PSHUFD or VPERMILPS/D instructions by pre-extracting the subvector source if that's its only use.
1 parent 7bf3137 commit d7f3775

File tree

6 files changed

+3286
-3414
lines changed

6 files changed

+3286
-3414
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58339,6 +58339,17 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
5833958339
return DAG.getNode(
5834058340
InOpcode, DL, VT,
5834158341
extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
58342+
case X86ISD::PSHUFD:
58343+
case X86ISD::VPERMILPI:
58344+
if (InVec.getOperand(0).hasOneUse()) {
58345+
uint64_t M = InVec.getConstantOperandVal(1) & 255;
58346+
M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
58347+
return DAG.getNode(InOpcode, DL, VT,
58348+
extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58349+
DL, SizeInBits),
58350+
DAG.getTargetConstant(M, DL, MVT::i8));
58351+
}
58352+
break;
5834258353
case X86ISD::PCMPEQ:
5834358354
case X86ISD::PCMPGT:
5834458355
case X86ISD::UNPCKH:

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 42 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2400,34 +2400,32 @@ define void @D107009(ptr %input, ptr %output) {
24002400
;
24012401
; AVX1-LABEL: D107009:
24022402
; AVX1: # %bb.0:
2403-
; AVX1-NEXT: vmovups 96(%rdi), %ymm0
2404-
; AVX1-NEXT: vmovups 128(%rdi), %ymm1
2405-
; AVX1-NEXT: vmovups 224(%rdi), %ymm2
2406-
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2407-
; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2408-
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
2409-
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
2410-
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
2411-
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
2412-
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
2403+
; AVX1-NEXT: vmovups 128(%rdi), %ymm0
2404+
; AVX1-NEXT: vmovups 224(%rdi), %ymm1
2405+
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2406+
; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
2407+
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,0],ymm0[4,5],ymm1[6,4]
2408+
; AVX1-NEXT: vmovaps 112(%rdi), %xmm1
2409+
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
2410+
; AVX1-NEXT: vmovaps 16(%rdi), %xmm2
2411+
; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
2412+
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
2413+
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
24132414
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2414-
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
24152415
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
2416-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2417-
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
2418-
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2419-
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2420-
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
2416+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2417+
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
2418+
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
24212419
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2422-
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7]
2423-
; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2]
2424-
; AVX1-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
2425-
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
2426-
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
2427-
; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
2420+
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3,3,3,7,7,7,7]
2421+
; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0,0,3,2]
2422+
; AVX1-NEXT: vmovshdup {{.*#+}} ymm5 = ymm0[1,1,3,3,5,5,7,7]
2423+
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
2424+
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1]
2425+
; AVX1-NEXT: vmovdqa %xmm1, 16(%rsi)
24282426
; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi)
24292427
; AVX1-NEXT: vmovdqa %xmm6, 112(%rsi)
2430-
; AVX1-NEXT: vmovups %ymm1, 128(%rsi)
2428+
; AVX1-NEXT: vmovups %ymm0, 128(%rsi)
24312429
; AVX1-NEXT: vmovups %ymm5, 160(%rsi)
24322430
; AVX1-NEXT: vmovupd %ymm4, 192(%rsi)
24332431
; AVX1-NEXT: vmovupd %ymm3, 224(%rsi)
@@ -2470,34 +2468,32 @@ define void @D107009(ptr %input, ptr %output) {
24702468
;
24712469
; XOP-LABEL: D107009:
24722470
; XOP: # %bb.0:
2473-
; XOP-NEXT: vmovups 96(%rdi), %ymm0
2474-
; XOP-NEXT: vmovups 128(%rdi), %ymm1
2475-
; XOP-NEXT: vmovups 224(%rdi), %ymm2
2476-
; XOP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2477-
; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2478-
; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
2479-
; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
2480-
; XOP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
2481-
; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
2482-
; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
2471+
; XOP-NEXT: vmovups 128(%rdi), %ymm0
2472+
; XOP-NEXT: vmovups 224(%rdi), %ymm1
2473+
; XOP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2474+
; XOP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
2475+
; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,0],ymm0[4,5],ymm1[6,4]
2476+
; XOP-NEXT: vmovaps 112(%rdi), %xmm1
2477+
; XOP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
2478+
; XOP-NEXT: vmovaps 16(%rdi), %xmm2
2479+
; XOP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
2480+
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
2481+
; XOP-NEXT: vpsrld $16, %xmm1, %xmm1
24832482
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0
2484-
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
24852483
; XOP-NEXT: vpsrld $16, %xmm0, %xmm0
2486-
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
2487-
; XOP-NEXT: vpsrld $16, %xmm1, %xmm1
2488-
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2489-
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2490-
; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
2484+
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2485+
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
2486+
; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
24912487
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2492-
; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7]
2493-
; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2]
2494-
; XOP-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
2495-
; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
2496-
; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
2497-
; XOP-NEXT: vmovdqa %xmm0, 16(%rsi)
2488+
; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3,3,3,7,7,7,7]
2489+
; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0,0,3,2]
2490+
; XOP-NEXT: vmovshdup {{.*#+}} ymm5 = ymm0[1,1,3,3,5,5,7,7]
2491+
; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
2492+
; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1]
2493+
; XOP-NEXT: vmovdqa %xmm1, 16(%rsi)
24982494
; XOP-NEXT: vmovdqa %xmm7, 48(%rsi)
24992495
; XOP-NEXT: vmovdqa %xmm6, 112(%rsi)
2500-
; XOP-NEXT: vmovups %ymm1, 128(%rsi)
2496+
; XOP-NEXT: vmovups %ymm0, 128(%rsi)
25012497
; XOP-NEXT: vmovups %ymm5, 160(%rsi)
25022498
; XOP-NEXT: vmovupd %ymm4, 192(%rsi)
25032499
; XOP-NEXT: vmovupd %ymm3, 224(%rsi)

0 commit comments

Comments
 (0)