Commit e117295

[X86][AVX] canonicalizeLaneShuffleWithRepeatedOps - merge VPERMILPD ops with different low/high masks.
Now that PR48908 has been dealt with, we can handle v4f64 permute cases by extracting the low/high lane VPERMILPD masks and creating a new mask based on which lanes are referenced by the VPERM2F128 mask.
1 parent: 518af8d
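
As a sketch of the node-level rewrite (illustrative, not a verbatim DAG dump): the combine only fires when both VPERMILPD sources use the same immediate (or the second source is undef), and it hoists the lane shuffle above the per-lane permute:

// Before: each source is permuted, then 128-bit lanes are selected.
//   t0 = X86ISD::VPERMILPI v4f64 A, imm(M)
//   t1 = X86ISD::VPERMILPI v4f64 B, imm(M)   // same M, or t1 undef
//   v  = X86ISD::VPERM2X128 v4f64 t0, t1, imm(L)
// After: lanes are selected first, then one permute with a rebuilt mask.
//   t  = X86ISD::VPERM2X128 v4f64 A, B, imm(L)
//   v  = X86ISD::VPERMILPI v4f64 t, imm(M')  // M' picks M's lo/hi 2-bit
//                                            // pairs according to L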

2 files changed: 38 additions & 26 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 20 additions & 4 deletions

@@ -36946,11 +36946,27 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
       return DAG.getBitcast(VT, Res);
     }
   case X86ISD::VPERMILPI:
-    // TODO: Handle v4f64 permutes with different low/high lane masks.
+    // Handle v4f64 permutes with different low/high lane masks by permuting
+    // the permute mask on a lane-by-lane basis.
     if (SrcVT0 == MVT::v4f64) {
-      uint64_t Mask = Src0.getConstantOperandVal(1);
-      if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
-        break;
+      if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
+        uint64_t LaneMask = V.getConstantOperandVal(2);
+        uint64_t Mask = Src0.getConstantOperandVal(1);
+        uint64_t LoMask = Mask & 0x3;
+        uint64_t HiMask = (Mask >> 2) & 0x3;
+        uint64_t NewMask = 0;
+        NewMask |= ((LaneMask & 0x02) ? HiMask : LoMask);
+        NewMask |= ((LaneMask & 0x20) ? HiMask : LoMask) << 2;
+        SDValue LHS = Src0.getOperand(0);
+        SDValue RHS =
+            Src1.isUndef() ? DAG.getUNDEF(SrcVT0) : Src1.getOperand(0);
+        SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
+                                  V.getOperand(2));
+        Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res,
+                          DAG.getTargetConstant(NewMask, DL, MVT::i8));
+        return DAG.getBitcast(VT, Res);
+      }
+      break;
     }
     LLVM_FALLTHROUGH;
   case X86ISD::VSHLI:

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

Lines changed: 18 additions & 22 deletions

@@ -442,18 +442,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X86-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
-; X86-AVX1-NEXT:    vmovapd %ymm3, (%edx)
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X86-AVX1-NEXT:    vmovapd %ymm4, (%edx)
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X86-AVX1-NEXT:    vmovapd %ymm3, (%ecx)
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]

@@ -515,18 +513,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ;
 ; X64-AVX1-LABEL: PR48908:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
-; X64-AVX1-NEXT:    vmovapd %ymm3, (%rdi)
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X64-AVX1-NEXT:    vmovapd %ymm4, (%rdi)
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X64-AVX1-NEXT:    vmovapd %ymm3, (%rsi)
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
