Skip to content

Commit ed51a8c

Browse files
RKSimon authored and joaosaffran committed
[X86] Fold EXTRACT_SUBVECTOR(VPERMV(V,M),C) -> EXTRACT_SUBVECTOR(VPERMV(V,M'),0)
Similar to what we already do for VPERMV3, we can fold a non-zero-index EXTRACT_SUBVECTOR into the shuffle mask and then use a free EXTRACT_SUBVECTOR(V,0) instead.
1 parent 0875d0f commit ed51a8c

File tree

3 files changed

+19
-16
lines changed

3 files changed

+19
-16
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58675,6 +58675,17 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
5867558675
DAG.getTargetConstant(M, DL, MVT::i8));
5867658676
}
5867758677
break;
58678+
case X86ISD::VPERMV:
58679+
if (IdxVal != 0) {
58680+
SDValue Mask = InVec.getOperand(0);
58681+
SDValue Src = InVec.getOperand(1);
58682+
Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
58683+
Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
58684+
DL, InSizeInBits);
58685+
SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
58686+
return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
58687+
}
58688+
break;
5867858689
case X86ISD::VPERMV3:
5867958690
if (IdxVal != 0) {
5868058691
SDValue Src0 = InVec.getOperand(0);

llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -242,9 +242,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
242242
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
243243
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
244244
; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
245-
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
245+
; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
246246
; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
247-
; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
248247
; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
249248
; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
250249
; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
@@ -308,9 +307,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
308307
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
309308
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
310309
; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
311-
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
310+
; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
312311
; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
313-
; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
314312
; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
315313
; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
316314
; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
@@ -374,9 +372,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
374372
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
375373
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
376374
; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
377-
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
375+
; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
378376
; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
379-
; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
380377
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
381378
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
382379
; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
@@ -440,9 +437,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
440437
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
441438
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
442439
; AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
443-
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7]
440+
; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
444441
; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
445-
; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
446442
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
447443
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
448444
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)

llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -226,9 +226,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
226226
; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
227227
; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
228228
; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
229-
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
229+
; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
230230
; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
231-
; AVX512-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
232231
; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
233232
; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
234233
; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
@@ -292,9 +291,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
292291
; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
293292
; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
294293
; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
295-
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
294+
; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
296295
; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
297-
; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
298296
; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
299297
; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
300298
; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
@@ -358,9 +356,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
358356
; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
359357
; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
360358
; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
361-
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
359+
; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
362360
; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
363-
; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
364361
; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
365362
; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
366363
; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
@@ -424,9 +421,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
424421
; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
425422
; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
426423
; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
427-
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5]
424+
; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
428425
; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
429-
; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
430426
; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
431427
; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
432428
; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]

0 commit comments

Comments
 (0)