Skip to content

Commit b52962d

Browse files
committed
[X86] LowerVSELECT - split v16i16/v32i8 pre-AVX2 VSELECT ops if enough of the operands are free to split.
On AVX1 we're often better off consistently using 128-bit instructions, so recognise when the operands are loads that can be freely/cheaply split. Ideally this functionality should be moved into isFreeToSplitVector, but that helper is used in a few places where we don't want to split loads yet. Based on a regression reported after #92794.
1 parent 654cd94 commit b52962d

File tree

2 files changed

+36
-20
lines changed

2 files changed

+36
-20
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17846,6 +17846,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
1784617846
return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
1784717847
}
1784817848

17849+
// v16i16/v32i8 selects without AVX2, if the condition and another operand
17850+
// are free to split, then better to split before expanding the
17851+
// select. Don't bother with XOP as it has the fast VPCMOV instruction.
17852+
// TODO: This is very similar to narrowVectorSelect.
17853+
// TODO: Add Load splitting to isFreeToSplitVector ?
17854+
if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
17855+
!Subtarget.hasXOP()) {
17856+
bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
17857+
bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
17858+
(ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
17859+
bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
17860+
(ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
17861+
if (FreeCond && (FreeLHS || FreeRHS))
17862+
return splitVectorOp(Op, DAG, dl);
17863+
}
17864+
1784917865
// Only some types will be legal on some subtargets. If we can emit a legal
1785017866
// VSELECT-matching blend, return Op, but if we need to expand, return
1785117867
// a null value.

llvm/test/CodeGen/X86/vselect-pcmp.ll

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1509,16 +1509,16 @@ define void @store_blend_load_v16i16(ptr %a0, ptr %a1, ptr %a2) {
15091509
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
15101510
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
15111511
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
1512-
; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm3
1513-
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
1514-
; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
1515-
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
1516-
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1517-
; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
1518-
; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
1519-
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1520-
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
1521-
; AVX1-NEXT: vzeroupper
1512+
; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
1513+
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm3
1514+
; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
1515+
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
1516+
; AVX1-NEXT: vmovdqa (%rsi), %xmm4
1517+
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
1518+
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
1519+
; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
1520+
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
1521+
; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
15221522
; AVX1-NEXT: retq
15231523
;
15241524
; AVX2-LABEL: store_blend_load_v16i16:
@@ -1578,16 +1578,16 @@ define void @store_blend_load_v32i8(ptr %a0, ptr %a1, ptr %a2) {
15781578
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
15791579
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
15801580
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1581-
; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm3
1582-
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
1583-
; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm2
1584-
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
1585-
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1586-
; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
1587-
; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
1588-
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1589-
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
1590-
; AVX1-NEXT: vzeroupper
1581+
; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm3
1582+
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
1583+
; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm2
1584+
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
1585+
; AVX1-NEXT: vmovdqa (%rsi), %xmm4
1586+
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
1587+
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
1588+
; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
1589+
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
1590+
; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
15911591
; AVX1-NEXT: retq
15921592
;
15931593
; AVX2-LABEL: store_blend_load_v32i8:

0 commit comments

Comments
 (0)