[DAGCombiner] Add support for scalarising extracts of a vector setcc

david-arm · david-arm · commit f22c13219a7e · 2024-11-28T09:38:35.000Z
For IR like this:

  %icmp = icmp ult &lt;4 x i32&gt; %a, splat (i32 5)
  %res = extractelement &lt;4 x i1&gt; %icmp, i32 1

where there is only one use of %icmp we can take a similar approach
to what we already do for binary ops such add, sub, etc. and convert
this into

  %ext = extractelement &lt;4 x i32&gt; %a, i32 1
  %res = icmp ult i32 %ext, 5

For AArch64 targets at least the scalar boolean result will almost
certainly need to be in a GPR anyway, since it will probably be
used by branches for control flow. I've tried to reuse existing code
in scalarizeExtractedBinop to also work for setcc.

NOTE: The optimisations don't apply for tests such as
extract_icmp_v4i32_splat_rhs in the file

CodeGen/AArch64/extract-vector-cmp.ll

because scalarizeExtractedBinOp only works if one of the input
operands is a constant.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22746,16 +22746,21 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
 
 /// Transform a vector binary operation into a scalar binary operation by moving
 /// the math/logic after an extract element of a vector.
-static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
-                                       const SDLoc &DL, bool LegalOperations) {
+static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
+                                       const SDLoc &DL) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Vec = ExtElt->getOperand(0);
   SDValue Index = ExtElt->getOperand(1);
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
+  unsigned Opc = Vec.getOpcode();
+  if (!IndexC || !Vec.hasOneUse() || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) ||
       Vec->getNumValues() != 1)
     return SDValue();
 
+  EVT ResVT = ExtElt->getValueType(0);
+  if (Opc == ISD::SETCC && ResVT != Vec.getValueType().getVectorElementType())
+    return SDValue();
+
   // Targets may want to avoid this to prevent an expensive register transfer.
   if (!TLI.shouldScalarizeBinop(Vec))
     return SDValue();
@@ -22766,19 +22771,24 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
   SDValue Op0 = Vec.getOperand(0);
   SDValue Op1 = Vec.getOperand(1);
   APInt SplatVal;
-  if (isAnyConstantBuildVector(Op0, true) ||
-      ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
-      isAnyConstantBuildVector(Op1, true) ||
-      ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
-    // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
-    // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
-    EVT VT = ExtElt->getValueType(0);
-    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
-    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
-    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
-  }
+  if (!isAnyConstantBuildVector(Op0, true) &&
+      !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) &&
+      !isAnyConstantBuildVector(Op1, true) &&
+      !ISD::isConstantSplatVector(Op1.getNode(), SplatVal))
+    return SDValue();
 
-  return SDValue();
+  // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C'
+  // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC)
+  if (Opc == ISD::SETCC) {
+    EVT OpVT = Op0->getValueType(0).getVectorElementType();
+    Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index);
+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index);
+    return DAG.getSetCC(DL, ResVT, Op0, Op1,
+                        cast<CondCodeSDNode>(Vec->getOperand(2))->get());
+  }
+  Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op0, Index);
+  Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op1, Index);
+  return DAG.getNode(Opc, DL, ResVT, Op0, Op1);
 }
 
 // Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
@@ -23011,7 +23021,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     }
   }
 
-  if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
+  if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL))
     return BO;
 
   if (VecVT.isScalableVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1348,6 +1348,10 @@ class AArch64TargetLowering : public TargetLowering {
   unsigned getMinimumJumpTableEntries() const override;
 
   bool softPromoteHalfType() const override { return true; }
+
+  bool shouldScalarizeBinop(SDValue VecOp) const override {
+    return VecOp.getOpcode() == ISD::SETCC;
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2093,7 +2093,7 @@ bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -429,7 +429,7 @@ bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3300,7 +3300,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -7,11 +7,9 @@ target triple = "aarch64-unknown-linux-gnu"
 define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
 ; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.4s, #5
-; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #5
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %icmp = icmp ult <4 x i32> %a, splat (i32 5)
   %ext = extractelement <4 x i1> %icmp, i32 1
@@ -21,11 +19,9 @@ define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
 define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
 ; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.4s, #7
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #7
+; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %icmp = icmp ult <4 x i32> splat(i32 7), %a
   %ext = extractelement <4 x i1> %icmp, i32 1
@@ -35,12 +31,9 @@ define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
 define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
 ; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI2_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #234
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %icmp = icmp ult <4 x i32> %a, <i32 5, i32 234, i32 -1, i32 7>
   %ext = extractelement <4 x i1> %icmp, i32 1
@@ -50,12 +43,10 @@ define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
 define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
 ; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov v1.4s, #4.00000000
-; CHECK-NEXT:    fcmge v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, lt
 ; CHECK-NEXT:    ret
   %fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
   %ext = extractelement <4 x i1> %fcmp, i32 1
@@ -66,39 +57,34 @@ define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
 ; CHECK-LABEL: vector_loop_with_icmp:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov w8, #15 // =0xf
-; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    mov w8, #2 // =0x2
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    dup v2.2d, x9
-; CHECK-NEXT:    add x9, x0, #4
-; CHECK-NEXT:    mov w10, #16 // =0x10
-; CHECK-NEXT:    mov w11, #1 // =0x1
+; CHECK-NEXT:    add x8, x0, #4
+; CHECK-NEXT:    mov w10, #1 // =0x1
 ; CHECK-NEXT:    b .LBB4_2
 ; CHECK-NEXT:  .LBB4_1: // %pred.store.continue6
 ; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    subs x10, x10, #2
-; CHECK-NEXT:    add x9, x9, #8
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    subs x9, x9, #2
+; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    b.eq .LBB4_6
 ; CHECK-NEXT:  .LBB4_2: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    cmhi v3.2d, v1.2d, v0.2d
-; CHECK-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEXT:    fmov w12, s3
-; CHECK-NEXT:    tbz w12, #0, .LBB4_4
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_4
 ; CHECK-NEXT:  // %bb.3: // %pred.store.if
 ; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT:    stur w11, [x9, #-4]
+; CHECK-NEXT:    stur w10, [x8, #-4]
 ; CHECK-NEXT:  .LBB4_4: // %pred.store.continue
 ; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT:    dup v3.2d, x8
-; CHECK-NEXT:    cmhi v3.2d, v3.2d, v0.2d
-; CHECK-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEXT:    mov w12, v3.s[1]
-; CHECK-NEXT:    tbz w12, #0, .LBB4_1
+; CHECK-NEXT:    mov x11, v0.d[1]
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_1
 ; CHECK-NEXT:  // %bb.5: // %pred.store.if5
 ; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT:    str w11, [x9]
+; CHECK-NEXT:    str w10, [x8]
 ; CHECK-NEXT:    b .LBB4_1
 ; CHECK-NEXT:  .LBB4_6: // %for.cond.cleanup
 ; CHECK-NEXT:    ret