Skip to content

Commit d3188c7

Browse files
committed
[X86] Add ISD::ABDS/ABDU vXi64 support on SSE41+ targets
If IMINMAX ops aren't legal, we can lower to the select(icmp(x,y), sub(x,y), sub(y,x)) pattern.
1 parent 39e6bd9 commit d3188c7

File tree

7 files changed

+847
-1425
lines changed

7 files changed

+847
-1425
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1256,7 +1256,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
12561256
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
12571257
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
12581258

1259-
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
1259+
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
12601260
setOperationAction(ISD::ABDS, VT, Custom);
12611261
setOperationAction(ISD::ABDU, VT, Custom);
12621262
}
@@ -1396,14 +1396,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
13961396
// In the customized shift lowering, the legal v8i32/v4i64 cases
13971397
// in AVX2 will be recognized.
13981398
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1399-
setOperationAction(ISD::SRL, VT, Custom);
1400-
setOperationAction(ISD::SHL, VT, Custom);
1401-
setOperationAction(ISD::SRA, VT, Custom);
1399+
setOperationAction(ISD::SRL, VT, Custom);
1400+
setOperationAction(ISD::SHL, VT, Custom);
1401+
setOperationAction(ISD::SRA, VT, Custom);
1402+
setOperationAction(ISD::ABDS, VT, Custom);
1403+
setOperationAction(ISD::ABDU, VT, Custom);
14021404
if (VT == MVT::v4i64) continue;
1403-
setOperationAction(ISD::ROTL, VT, Custom);
1404-
setOperationAction(ISD::ROTR, VT, Custom);
1405-
setOperationAction(ISD::FSHL, VT, Custom);
1406-
setOperationAction(ISD::FSHR, VT, Custom);
1405+
setOperationAction(ISD::ROTL, VT, Custom);
1406+
setOperationAction(ISD::ROTR, VT, Custom);
1407+
setOperationAction(ISD::FSHL, VT, Custom);
1408+
setOperationAction(ISD::FSHR, VT, Custom);
14071409
}
14081410

14091411
// These types need custom splitting if their input is a 128-bit vector.
@@ -1499,8 +1501,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
14991501
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
15001502
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
15011503
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1502-
setOperationAction(ISD::ABDS, VT, Custom);
1503-
setOperationAction(ISD::ABDU, VT, Custom);
15041504
}
15051505

15061506
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
@@ -1968,8 +1968,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
19681968
setOperationAction(ISD::SMIN, VT, Legal);
19691969
setOperationAction(ISD::UMIN, VT, Legal);
19701970
setOperationAction(ISD::ABS, VT, Legal);
1971-
setOperationAction(ISD::ABDS, VT, Custom);
1972-
setOperationAction(ISD::ABDU, VT, Custom);
19731971
}
19741972

19751973
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
@@ -29659,15 +29657,30 @@ static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
2965929657
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
2966029658
return splitVectorIntBinary(Op, DAG);
2966129659

29662-
// Default to expand: sub(smax(lhs,rhs),smin(lhs,rhs))
2966329660
// TODO: Add TargetLowering expandABD() support.
2966429661
SDLoc dl(Op);
2966529662
bool IsSigned = Op.getOpcode() == ISD::ABDS;
2966629663
SDValue LHS = DAG.getFreeze(Op.getOperand(0));
2966729664
SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29668-
SDValue Max = DAG.getNode(IsSigned ? ISD::SMAX : ISD::UMAX, dl, VT, LHS, RHS);
29669-
SDValue Min = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, dl, VT, LHS, RHS);
29670-
return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
29665+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29666+
29667+
// abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
29668+
// abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
29669+
unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
29670+
unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
29671+
if (TLI.isOperationLegal(MaxOpc, VT) && TLI.isOperationLegal(MinOpc, VT)) {
29672+
SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
29673+
SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
29674+
return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
29675+
}
29676+
29677+
// abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
29678+
// abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
29679+
EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29680+
ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
29681+
SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
29682+
return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
29683+
DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
2967129684
}
2967229685

2967329686
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,

llvm/test/CodeGen/X86/abds-vector-128.ll

Lines changed: 45 additions & 162 deletions
Original file line number | Diff line number | Diff line change
@@ -501,89 +501,30 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
501501
;
502502
; SSE42-LABEL: abd_ext_v2i64:
503503
; SSE42: # %bb.0:
504-
; SSE42-NEXT: movq %xmm0, %rax
505-
; SSE42-NEXT: movq %rax, %rcx
506-
; SSE42-NEXT: sarq $63, %rcx
507-
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
508-
; SSE42-NEXT: movq %rdx, %rsi
509-
; SSE42-NEXT: sarq $63, %rsi
510-
; SSE42-NEXT: movq %xmm1, %rdi
511-
; SSE42-NEXT: movq %rdi, %r8
512-
; SSE42-NEXT: sarq $63, %r8
513-
; SSE42-NEXT: pextrq $1, %xmm1, %r9
514-
; SSE42-NEXT: movq %r9, %r10
515-
; SSE42-NEXT: sarq $63, %r10
516-
; SSE42-NEXT: subq %r9, %rdx
517-
; SSE42-NEXT: sbbq %r10, %rsi
518-
; SSE42-NEXT: subq %rdi, %rax
519-
; SSE42-NEXT: sbbq %r8, %rcx
520-
; SSE42-NEXT: sarq $63, %rcx
521-
; SSE42-NEXT: xorq %rcx, %rax
522-
; SSE42-NEXT: subq %rcx, %rax
523-
; SSE42-NEXT: sarq $63, %rsi
524-
; SSE42-NEXT: xorq %rsi, %rdx
525-
; SSE42-NEXT: subq %rsi, %rdx
526-
; SSE42-NEXT: movq %rdx, %xmm1
527-
; SSE42-NEXT: movq %rax, %xmm0
528-
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
504+
; SSE42-NEXT: movdqa %xmm0, %xmm2
505+
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
506+
; SSE42-NEXT: movdqa %xmm0, %xmm3
507+
; SSE42-NEXT: psubq %xmm1, %xmm3
508+
; SSE42-NEXT: psubq %xmm0, %xmm1
509+
; SSE42-NEXT: movdqa %xmm2, %xmm0
510+
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
511+
; SSE42-NEXT: movapd %xmm1, %xmm0
529512
; SSE42-NEXT: retq
530513
;
531514
; AVX1-LABEL: abd_ext_v2i64:
532515
; AVX1: # %bb.0:
533-
; AVX1-NEXT: vmovq %xmm0, %rax
534-
; AVX1-NEXT: movq %rax, %rcx
535-
; AVX1-NEXT: sarq $63, %rcx
536-
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
537-
; AVX1-NEXT: movq %rdx, %rsi
538-
; AVX1-NEXT: sarq $63, %rsi
539-
; AVX1-NEXT: vmovq %xmm1, %rdi
540-
; AVX1-NEXT: movq %rdi, %r8
541-
; AVX1-NEXT: sarq $63, %r8
542-
; AVX1-NEXT: vpextrq $1, %xmm1, %r9
543-
; AVX1-NEXT: movq %r9, %r10
544-
; AVX1-NEXT: sarq $63, %r10
545-
; AVX1-NEXT: subq %r9, %rdx
546-
; AVX1-NEXT: sbbq %r10, %rsi
547-
; AVX1-NEXT: subq %rdi, %rax
548-
; AVX1-NEXT: sbbq %r8, %rcx
549-
; AVX1-NEXT: sarq $63, %rcx
550-
; AVX1-NEXT: xorq %rcx, %rax
551-
; AVX1-NEXT: subq %rcx, %rax
552-
; AVX1-NEXT: sarq $63, %rsi
553-
; AVX1-NEXT: xorq %rsi, %rdx
554-
; AVX1-NEXT: subq %rsi, %rdx
555-
; AVX1-NEXT: vmovq %rdx, %xmm0
556-
; AVX1-NEXT: vmovq %rax, %xmm1
557-
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
516+
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
517+
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
518+
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
519+
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
558520
; AVX1-NEXT: retq
559521
;
560522
; AVX2-LABEL: abd_ext_v2i64:
561523
; AVX2: # %bb.0:
562-
; AVX2-NEXT: vmovq %xmm0, %rax
563-
; AVX2-NEXT: movq %rax, %rcx
564-
; AVX2-NEXT: sarq $63, %rcx
565-
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
566-
; AVX2-NEXT: movq %rdx, %rsi
567-
; AVX2-NEXT: sarq $63, %rsi
568-
; AVX2-NEXT: vmovq %xmm1, %rdi
569-
; AVX2-NEXT: movq %rdi, %r8
570-
; AVX2-NEXT: sarq $63, %r8
571-
; AVX2-NEXT: vpextrq $1, %xmm1, %r9
572-
; AVX2-NEXT: movq %r9, %r10
573-
; AVX2-NEXT: sarq $63, %r10
574-
; AVX2-NEXT: subq %r9, %rdx
575-
; AVX2-NEXT: sbbq %r10, %rsi
576-
; AVX2-NEXT: subq %rdi, %rax
577-
; AVX2-NEXT: sbbq %r8, %rcx
578-
; AVX2-NEXT: sarq $63, %rcx
579-
; AVX2-NEXT: xorq %rcx, %rax
580-
; AVX2-NEXT: subq %rcx, %rax
581-
; AVX2-NEXT: sarq $63, %rsi
582-
; AVX2-NEXT: xorq %rsi, %rdx
583-
; AVX2-NEXT: subq %rsi, %rdx
584-
; AVX2-NEXT: vmovq %rdx, %xmm0
585-
; AVX2-NEXT: vmovq %rax, %xmm1
586-
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
524+
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
525+
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
526+
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
527+
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
587528
; AVX2-NEXT: retq
588529
;
589530
; AVX512-LABEL: abd_ext_v2i64:
@@ -634,89 +575,30 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
634575
;
635576
; SSE42-LABEL: abd_ext_v2i64_undef:
636577
; SSE42: # %bb.0:
637-
; SSE42-NEXT: movq %xmm0, %rax
638-
; SSE42-NEXT: movq %rax, %rcx
639-
; SSE42-NEXT: sarq $63, %rcx
640-
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
641-
; SSE42-NEXT: movq %rdx, %rsi
642-
; SSE42-NEXT: sarq $63, %rsi
643-
; SSE42-NEXT: movq %xmm1, %rdi
644-
; SSE42-NEXT: movq %rdi, %r8
645-
; SSE42-NEXT: sarq $63, %r8
646-
; SSE42-NEXT: pextrq $1, %xmm1, %r9
647-
; SSE42-NEXT: movq %r9, %r10
648-
; SSE42-NEXT: sarq $63, %r10
649-
; SSE42-NEXT: subq %r9, %rdx
650-
; SSE42-NEXT: sbbq %r10, %rsi
651-
; SSE42-NEXT: subq %rdi, %rax
652-
; SSE42-NEXT: sbbq %r8, %rcx
653-
; SSE42-NEXT: sarq $63, %rcx
654-
; SSE42-NEXT: xorq %rcx, %rax
655-
; SSE42-NEXT: subq %rcx, %rax
656-
; SSE42-NEXT: sarq $63, %rsi
657-
; SSE42-NEXT: xorq %rsi, %rdx
658-
; SSE42-NEXT: subq %rsi, %rdx
659-
; SSE42-NEXT: movq %rdx, %xmm1
660-
; SSE42-NEXT: movq %rax, %xmm0
661-
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
578+
; SSE42-NEXT: movdqa %xmm0, %xmm2
579+
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
580+
; SSE42-NEXT: movdqa %xmm0, %xmm3
581+
; SSE42-NEXT: psubq %xmm1, %xmm3
582+
; SSE42-NEXT: psubq %xmm0, %xmm1
583+
; SSE42-NEXT: movdqa %xmm2, %xmm0
584+
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
585+
; SSE42-NEXT: movapd %xmm1, %xmm0
662586
; SSE42-NEXT: retq
663587
;
664588
; AVX1-LABEL: abd_ext_v2i64_undef:
665589
; AVX1: # %bb.0:
666-
; AVX1-NEXT: vmovq %xmm0, %rax
667-
; AVX1-NEXT: movq %rax, %rcx
668-
; AVX1-NEXT: sarq $63, %rcx
669-
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
670-
; AVX1-NEXT: movq %rdx, %rsi
671-
; AVX1-NEXT: sarq $63, %rsi
672-
; AVX1-NEXT: vmovq %xmm1, %rdi
673-
; AVX1-NEXT: movq %rdi, %r8
674-
; AVX1-NEXT: sarq $63, %r8
675-
; AVX1-NEXT: vpextrq $1, %xmm1, %r9
676-
; AVX1-NEXT: movq %r9, %r10
677-
; AVX1-NEXT: sarq $63, %r10
678-
; AVX1-NEXT: subq %r9, %rdx
679-
; AVX1-NEXT: sbbq %r10, %rsi
680-
; AVX1-NEXT: subq %rdi, %rax
681-
; AVX1-NEXT: sbbq %r8, %rcx
682-
; AVX1-NEXT: sarq $63, %rcx
683-
; AVX1-NEXT: xorq %rcx, %rax
684-
; AVX1-NEXT: subq %rcx, %rax
685-
; AVX1-NEXT: sarq $63, %rsi
686-
; AVX1-NEXT: xorq %rsi, %rdx
687-
; AVX1-NEXT: subq %rsi, %rdx
688-
; AVX1-NEXT: vmovq %rdx, %xmm0
689-
; AVX1-NEXT: vmovq %rax, %xmm1
690-
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
590+
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
591+
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
592+
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
593+
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
691594
; AVX1-NEXT: retq
692595
;
693596
; AVX2-LABEL: abd_ext_v2i64_undef:
694597
; AVX2: # %bb.0:
695-
; AVX2-NEXT: vmovq %xmm0, %rax
696-
; AVX2-NEXT: movq %rax, %rcx
697-
; AVX2-NEXT: sarq $63, %rcx
698-
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
699-
; AVX2-NEXT: movq %rdx, %rsi
700-
; AVX2-NEXT: sarq $63, %rsi
701-
; AVX2-NEXT: vmovq %xmm1, %rdi
702-
; AVX2-NEXT: movq %rdi, %r8
703-
; AVX2-NEXT: sarq $63, %r8
704-
; AVX2-NEXT: vpextrq $1, %xmm1, %r9
705-
; AVX2-NEXT: movq %r9, %r10
706-
; AVX2-NEXT: sarq $63, %r10
707-
; AVX2-NEXT: subq %r9, %rdx
708-
; AVX2-NEXT: sbbq %r10, %rsi
709-
; AVX2-NEXT: subq %rdi, %rax
710-
; AVX2-NEXT: sbbq %r8, %rcx
711-
; AVX2-NEXT: sarq $63, %rcx
712-
; AVX2-NEXT: xorq %rcx, %rax
713-
; AVX2-NEXT: subq %rcx, %rax
714-
; AVX2-NEXT: sarq $63, %rsi
715-
; AVX2-NEXT: xorq %rsi, %rdx
716-
; AVX2-NEXT: subq %rsi, %rdx
717-
; AVX2-NEXT: vmovq %rdx, %xmm0
718-
; AVX2-NEXT: vmovq %rax, %xmm1
719-
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
598+
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
599+
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
600+
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
601+
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
720602
; AVX2-NEXT: retq
721603
;
722604
; AVX512-LABEL: abd_ext_v2i64_undef:
@@ -866,28 +748,29 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
866748
; SSE42-LABEL: abd_minmax_v2i64:
867749
; SSE42: # %bb.0:
868750
; SSE42-NEXT: movdqa %xmm0, %xmm2
869-
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
870-
; SSE42-NEXT: movdqa %xmm2, %xmm3
871-
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
872-
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
873-
; SSE42-NEXT: psubq %xmm3, %xmm1
874-
; SSE42-NEXT: movdqa %xmm1, %xmm0
751+
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
752+
; SSE42-NEXT: movdqa %xmm0, %xmm3
753+
; SSE42-NEXT: psubq %xmm1, %xmm3
754+
; SSE42-NEXT: psubq %xmm0, %xmm1
755+
; SSE42-NEXT: movdqa %xmm2, %xmm0
756+
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
757+
; SSE42-NEXT: movapd %xmm1, %xmm0
875758
; SSE42-NEXT: retq
876759
;
877760
; AVX1-LABEL: abd_minmax_v2i64:
878761
; AVX1: # %bb.0:
879762
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
880-
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3
881-
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
882-
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
763+
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
764+
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
765+
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
883766
; AVX1-NEXT: retq
884767
;
885768
; AVX2-LABEL: abd_minmax_v2i64:
886769
; AVX2: # %bb.0:
887770
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
888-
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3
889-
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
890-
; AVX2-NEXT: vpsubq %xmm3, %xmm0, %xmm0
771+
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
772+
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
773+
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
891774
; AVX2-NEXT: retq
892775
;
893776
; AVX512-LABEL: abd_minmax_v2i64:

0 commit comments

Comments (0)