Skip to content

Commit 4b529f8

Browse files
committed
[X86] Fold extractsubvector(permv3(src0,mask,src1),c) -> extractsubvector(permv3(src0,widensubvector(extractsubvector(mask,c)),src1),0) iff c != 0
For cross-lane shuffles, extract the (upper) subvector of the mask operand directly, and make use of the free implicit extraction of the lowest subvector of the result.
1 parent adf02ae commit 4b529f8

File tree

3 files changed

+29
-40
lines changed

3 files changed

+29
-40
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57791,6 +57791,19 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
5779157791
DAG.getTargetConstant(M, DL, MVT::i8));
5779257792
}
5779357793
break;
57794+
case X86ISD::VPERMV3:
57795+
if (IdxVal != 0) {
57796+
SDValue Src0 = InVec.getOperand(0);
57797+
SDValue Mask = InVec.getOperand(1);
57798+
SDValue Src1 = InVec.getOperand(2);
57799+
Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
57800+
Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
57801+
DL, InSizeInBits);
57802+
SDValue Shuffle =
57803+
DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
57804+
return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
57805+
}
57806+
break;
5779457807
}
5779557808
}
5779657809
}

llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -240,21 +240,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
240240
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
241241
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
242242
; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
243-
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
244-
; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
243+
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
245244
; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
246-
; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
247-
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
248-
; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1]
245+
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
249246
; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
250-
; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
251247
; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
252248
; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
253249
; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
254250
; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
255251
; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
256252
; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
257-
; AVX512-FCP-NEXT: vmovq %xmm1, (%rax)
253+
; AVX512-FCP-NEXT: vmovq %xmm8, (%rax)
258254
; AVX512-FCP-NEXT: vzeroupper
259255
; AVX512-FCP-NEXT: retq
260256
;
@@ -309,21 +305,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
309305
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
310306
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
311307
; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
312-
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
313-
; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
308+
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
314309
; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
315-
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
316-
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
317-
; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1]
310+
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
318311
; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
319-
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
320312
; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
321313
; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
322314
; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
323315
; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
324316
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
325317
; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
326-
; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax)
318+
; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rax)
327319
; AVX512DQ-FCP-NEXT: vzeroupper
328320
; AVX512DQ-FCP-NEXT: retq
329321
;
@@ -378,21 +370,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
378370
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
379371
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
380372
; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
381-
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
382-
; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
373+
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
383374
; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
384-
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
385-
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
386-
; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
375+
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
387376
; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
388-
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
389377
; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
390378
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
391379
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
392380
; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
393381
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
394382
; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
395-
; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
383+
; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax)
396384
; AVX512BW-FCP-NEXT: vzeroupper
397385
; AVX512BW-FCP-NEXT: retq
398386
;
@@ -447,21 +435,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
447435
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
448436
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
449437
; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
450-
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
451-
; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
438+
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
452439
; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
453-
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
454-
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
455-
; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
440+
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
456441
; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
457-
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
458442
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
459443
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
460444
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
461445
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
462446
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
463447
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
464-
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
448+
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax)
465449
; AVX512DQ-BW-FCP-NEXT: vzeroupper
466450
; AVX512DQ-BW-FCP-NEXT: retq
467451
%wide.vec = load <14 x i32>, ptr %in.vec, align 64

llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
226226
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
227227
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
228228
; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
229-
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
230-
; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
229+
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
231230
; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
232-
; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
233231
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
234232
; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
235233
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
@@ -293,10 +291,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
293291
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
294292
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
295293
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
296-
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
297-
; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
294+
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
298295
; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
299-
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
300296
; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
301297
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
302298
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
@@ -360,10 +356,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
360356
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
361357
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
362358
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
363-
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
364-
; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
359+
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
365360
; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
366-
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
367361
; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
368362
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
369363
; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
@@ -427,10 +421,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
427421
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
428422
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
429423
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
430-
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
431-
; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
424+
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5]
432425
; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
433-
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
434426
; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
435427
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
436428
; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]

0 commit comments

Comments
 (0)