[DAGCombiner] Add support for scalarising extracts of a vector setcc #117566

david-arm · 2024-11-25T14:53:07Z

For IR like this:

%icmp = icmp ult <4 x i32> %a, splat (i32 5)
%res = extractelement <4 x i1> %icmp, i32 1

where there is only one use of %icmp we can take a similar approach
to what we already do for binary ops such add, sub, etc. and convert
this into

%ext = extractelement <4 x i32> %a, i32 1
%res = icmp ult i32 %ext, 5

For AArch64 targets at least the scalar boolean result will almost
certainly need to be in a GPR anyway, since it will probably be
used by branches for control flow. I've tried to reuse existing code
in scalarizeExtractedBinop to also work for setcc.

NOTE: The optimisations don't apply for tests such as
extract_icmp_v4i32_splat_rhs in the file

CodeGen/AArch64/extract-vector-cmp.ll

because scalarizeExtractedBinOp only works if one of the input
operands is a constant.

llvmbot · 2024-11-25T14:53:41Z

@llvm/pr-subscribers-backend-webassembly

@llvm/pr-subscribers-backend-risc-v

Author: David Sherwood (david-arm)

Changes

For IR like this:

%icmp = icmp ult <4 x i32> %a, splat (i32 5)
%res = extractelement <4 x i1> %icmp, i32 1

where there is only one use of %icmp we can take a similar approach
to what we already do for binary ops such add, sub, etc. and convert
this into

%ext = extractelement <4 x i32> %a, i32 1
%res = icmp ult i32 %ext, 5

For AArch64 targets at least the scalar boolean result will almost
certainly need to be in a GPR anyway, since it will probably be
used by branches for control flow. I've tried to reuse existing code
in scalarizeExtractedBinop to also work for setcc.

NOTE: The optimisations don't apply for tests such as
extract_icmp_v4i32_splat_rhs in the file

CodeGen/AArch64/extract-vector-cmp.ll

because scalarizeExtractedBinOp only works if one of the input
operands is a constant.

Full diff: https://github.com/llvm/llvm-project/pull/117566.diff

8 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+27-16)
(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+4)
(modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+1-1)
(modified) llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp (+1-1)
(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+1-1)
(modified) llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll (+24-22)
(added) llvm/test/CodeGen/AArch64/extract-vector-cmp.ll (+225)
(modified) llvm/test/CodeGen/X86/vselect.ll (+26)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 521829675ae7c3..e8e4e4e9146531 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22746,16 +22746,21 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
 
 /// Transform a vector binary operation into a scalar binary operation by moving
 /// the math/logic after an extract element of a vector.
-static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
-                                       const SDLoc &DL, bool LegalOperations) {
+static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
+                                       const SDLoc &DL) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Vec = ExtElt->getOperand(0);
   SDValue Index = ExtElt->getOperand(1);
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
+  unsigned Opc = Vec.getOpcode();
+  if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() ||
       Vec->getNumValues() != 1)
     return SDValue();
 
+  EVT ResVT = ExtElt->getValueType(0);
+  if (Opc == ISD::SETCC && ResVT != Vec.getValueType().getVectorElementType())
+    return SDValue();
+
   // Targets may want to avoid this to prevent an expensive register transfer.
   if (!TLI.shouldScalarizeBinop(Vec))
     return SDValue();
@@ -22766,19 +22771,25 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
   SDValue Op0 = Vec.getOperand(0);
   SDValue Op1 = Vec.getOperand(1);
   APInt SplatVal;
-  if (isAnyConstantBuildVector(Op0, true) ||
-      ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
-      isAnyConstantBuildVector(Op1, true) ||
-      ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
-    // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
-    // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
-    EVT VT = ExtElt->getValueType(0);
-    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
-    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
-    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
+  if (!isAnyConstantBuildVector(Op0, true) &&
+      !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) &&
+      !isAnyConstantBuildVector(Op1, true) &&
+      !ISD::isConstantSplatVector(Op1.getNode(), SplatVal))
+    return SDValue();
+
+  // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C'
+  // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC)
+  if (Opc == ISD::SETCC) {
+    EVT OpVT = Op0->getValueType(0).getVectorElementType();
+    Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index);
+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index);
+    return DAG.getSetCC(DL, ResVT, Op0, Op1,
+                        cast<CondCodeSDNode>(Vec->getOperand(2))->get());
+  } else {
+    Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op0, Index);
+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op1, Index);
+    return DAG.getNode(Opc, DL, ResVT, Op0, Op1);
   }
-
-  return SDValue();
 }
 
 // Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
@@ -23011,7 +23022,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     }
   }
 
-  if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
+  if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL))
     return BO;
 
   if (VecVT.isScalableVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cb0b9e965277aa..d51b36f7e49946 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1348,6 +1348,10 @@ class AArch64TargetLowering : public TargetLowering {
   unsigned getMinimumJumpTableEntries() const override;
 
   bool softPromoteHalfType() const override { return true; }
+
+  bool shouldScalarizeBinop(SDValue VecOp) const override {
+    return VecOp.getOpcode() == ISD::SETCC;
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6eae756b25fb5b..c0021f69f18e77 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2093,7 +2093,7 @@ bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 2d00889407ff48..a52af6832d583f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -429,7 +429,7 @@ bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9048d1d83f1874..d3b569eccc17be 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3300,7 +3300,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 5a5dee0b53d439..4cb1d5b2fb345d 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -5,7 +5,7 @@
 
 declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x 16 x ptr>, i32 immarg, <vscale x 16 x i1>)
 
-define fastcc i8 @allocno_reload_assign() {
+define fastcc i8 @allocno_reload_assign(ptr %p) {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, xzr
@@ -14,8 +14,8 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    uzp1 p0.s, p0.s, p0.s
 ; CHECK-NEXT:    uzp1 p0.h, p0.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p0.b, p0.b
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    uzp1 p8.b, p0.b, p0.b
+; CHECK-NEXT:    mov z0.b, p8/z, #1 // =0x1
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
@@ -30,34 +30,35 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT:    punpklo p1.h, p0.b
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    punpklo p2.h, p1.b
-; CHECK-NEXT:    punpkhi p3.h, p1.b
+; CHECK-NEXT:    punpkhi p4.h, p1.b
 ; CHECK-NEXT:    uunpklo z0.d, z2.s
 ; CHECK-NEXT:    uunpkhi z1.d, z2.s
-; CHECK-NEXT:    punpklo p5.h, p0.b
+; CHECK-NEXT:    punpklo p6.h, p0.b
 ; CHECK-NEXT:    uunpklo z2.d, z3.s
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    punpkhi p7.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpklo z4.d, z5.s
 ; CHECK-NEXT:    uunpkhi z5.d, z5.s
 ; CHECK-NEXT:    uunpklo z6.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:    punpklo p0.h, p2.b
-; CHECK-NEXT:    punpkhi p1.h, p2.b
-; CHECK-NEXT:    punpklo p2.h, p3.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    punpklo p4.h, p5.b
-; CHECK-NEXT:    punpkhi p5.h, p5.b
-; CHECK-NEXT:    punpklo p6.h, p7.b
-; CHECK-NEXT:    punpkhi p7.h, p7.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
+; CHECK-NEXT:    punpkhi p2.h, p2.b
+; CHECK-NEXT:    punpklo p3.h, p4.b
+; CHECK-NEXT:    punpkhi p4.h, p4.b
+; CHECK-NEXT:    punpklo p5.h, p6.b
+; CHECK-NEXT:    punpkhi p6.h, p6.b
+; CHECK-NEXT:    punpklo p7.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    st1b { z0.d }, p0, [z16.d]
-; CHECK-NEXT:    st1b { z1.d }, p1, [z16.d]
-; CHECK-NEXT:    st1b { z2.d }, p2, [z16.d]
-; CHECK-NEXT:    st1b { z3.d }, p3, [z16.d]
-; CHECK-NEXT:    st1b { z4.d }, p4, [z16.d]
-; CHECK-NEXT:    st1b { z5.d }, p5, [z16.d]
-; CHECK-NEXT:    st1b { z6.d }, p6, [z16.d]
-; CHECK-NEXT:    st1b { z7.d }, p7, [z16.d]
+; CHECK-NEXT:    st1b { z0.d }, p1, [z16.d]
+; CHECK-NEXT:    st1b { z1.d }, p2, [z16.d]
+; CHECK-NEXT:    st1b { z2.d }, p3, [z16.d]
+; CHECK-NEXT:    st1b { z3.d }, p4, [z16.d]
+; CHECK-NEXT:    st1b { z4.d }, p5, [z16.d]
+; CHECK-NEXT:    st1b { z5.d }, p6, [z16.d]
+; CHECK-NEXT:    st1b { z6.d }, p7, [z16.d]
+; CHECK-NEXT:    st1b { z7.d }, p0, [z16.d]
+; CHECK-NEXT:    str p8, [x0]
 ; CHECK-NEXT:    b .LBB0_1
   br label %1
 
@@ -66,6 +67,7 @@ define fastcc i8 @allocno_reload_assign() {
   %constexpr1 = shufflevector <vscale x 16 x i1> %constexpr, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %constexpr2 = xor <vscale x 16 x i1> %constexpr1, shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
   call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> %constexpr2)
+  store <vscale x 16 x i1> %constexpr, ptr %p, align 16
   br label %1
 }
 
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
new file mode 100644
index 00000000000000..2388a0f206a51c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #5
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat (i32 5)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #7
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> splat(i32 7), %a
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #234
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, <i32 5, i32 234, i32 -1, i32 7>
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
+; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
+  %ext = extractelement <4 x i1> %fcmp, i32 1
+  ret i1 %ext
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    mov w9, #16 // =0x10
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    mov w10, #1 // =0x1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, #2 // =0x2
+; CHECK-NEXT:    b .LBB4_2
+; CHECK-NEXT:  .LBB4_1: // %pred.store.continue18
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    subs x9, x9, #4
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.eq .LBB4_10
+; CHECK-NEXT:  .LBB4_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_4
+; CHECK-NEXT:  // %bb.3: // %pred.store.if
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-8]
+; CHECK-NEXT:  .LBB4_4: // %pred.store.continue
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mov x11, v0.d[1]
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_6
+; CHECK-NEXT:  // %bb.5: // %pred.store.if5
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-4]
+; CHECK-NEXT:  .LBB4_6: // %pred.store.continue6
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    fmov x11, d1
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_8
+; CHECK-NEXT:  // %bb.7: // %pred.store.if7
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:  .LBB4_8: // %pred.store.continue8
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mov x11, v1.d[1]
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_1
+; CHECK-NEXT:  // %bb.9: // %pred.store.if9
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    str w10, [x8, #4]
+; CHECK-NEXT:    b .LBB4_1
+; CHECK-NEXT:  .LBB4_10: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+  %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+  %0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
+  %1 = extractelement <4 x i1> %0, i64 0
+  br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+  %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 1, ptr %2, align 4
+  br label %pred.store.continue
+
+pred.store.continue:
+  %3 = extractelement <4 x i1> %0, i64 1
+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1
+  %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:
+  %6 = extractelement <4 x i1> %0, i64 2
+  br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+  %7 = or disjoint i64 %index, 2
+  %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+  store i32 1, ptr %8, align 4
+  br label %pred.store.continue8
+
+pred.store.continue8:
+  %9 = extractelement <4 x i1> %0, i64 3
+  br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+  %10 = or disjoint i64 %index, 3
+  %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+  store i32 1, ptr %11, align 4
+  br label %pred.store.continue18
+
+pred.store.continue18:
+  %index.next = add i64 %index, 4
+  %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+  %24 = icmp eq i64 %index.next, 16
+  br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+
+; Negative tests
+
+define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %ins = insertelement <4 x i32> poison, i32 %b, i32 0
+  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
+  %icmp = icmp ult <4 x i32> %a, %splat
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #235
+; CHECK-NEXT:    adrp x9, .LCPI6_0
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI6_0]
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v1.4h, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    and w0, w9, #0x1
+; CHECK-NEXT:    strb w10, [x8]
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 235)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  store <4 x i1> %icmp, ptr %p, align 4
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_unknown_idx(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_unknown_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movi v1.4s, #127
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    bfi x8, x0, #1, #2
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    str d0, [sp, #8]
+; CHECK-NEXT:    ldrh w8, [x8]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 127)
+  %ext = extractelement <4 x i1> %icmp, i32 %c
+  ret i1 %ext
+}
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 9acd995d612c31..be6ee8f6899584 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -796,3 +796,29 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
   ret i64 %4
 }
 
+; Tests the scalarizeBinOp code in DAGCombiner
+define void @scalarize_binop(<1 x i1> %a) {
+; SSE-LABEL: scalarize_binop:
+; SSE:       # %bb.0: # %bb0
+; SSE-NEXT:    .p2align 4
+; SSE-NEXT:  .LBB35_1: # %bb1
+; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE-NEXT:    jmp .LBB35_1
+;
+; AVX-LABEL: scalarize_binop:
+; AVX:       # %bb.0: # %bb0
+; AVX-NEXT:    .p2align 4
+; AVX-NEXT:  .LBB35_1: # %bb1
+; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX-NEXT:    jmp .LBB35_1
+bb0:
+  br label %bb1
+
+bb1:
+  %b = select <1 x i1> %a, <1 x i1> zeroinitializer, <1 x i1> splat (i1 true)
+  br label %bb2
+
+bb2:
+  %c = extractelement <1 x i1> %b, i32 0
+  br label %bb1
+}

llvmbot · 2024-11-25T14:53:41Z

@llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

Changes

For IR like this:

%icmp = icmp ult <4 x i32> %a, splat (i32 5)
%res = extractelement <4 x i1> %icmp, i32 1

where there is only one use of %icmp we can take a similar approach
to what we already do for binary ops such add, sub, etc. and convert
this into

%ext = extractelement <4 x i32> %a, i32 1
%res = icmp ult i32 %ext, 5

For AArch64 targets at least the scalar boolean result will almost
certainly need to be in a GPR anyway, since it will probably be
used by branches for control flow. I've tried to reuse existing code
in scalarizeExtractedBinop to also work for setcc.

NOTE: The optimisations don't apply for tests such as
extract_icmp_v4i32_splat_rhs in the file

CodeGen/AArch64/extract-vector-cmp.ll

because scalarizeExtractedBinOp only works if one of the input
operands is a constant.

Full diff: https://github.com/llvm/llvm-project/pull/117566.diff

8 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+27-16)
(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+4)
(modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+1-1)
(modified) llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp (+1-1)
(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+1-1)
(modified) llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll (+24-22)
(added) llvm/test/CodeGen/AArch64/extract-vector-cmp.ll (+225)
(modified) llvm/test/CodeGen/X86/vselect.ll (+26)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 521829675ae7c3..e8e4e4e9146531 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22746,16 +22746,21 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
 
 /// Transform a vector binary operation into a scalar binary operation by moving
 /// the math/logic after an extract element of a vector.
-static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
-                                       const SDLoc &DL, bool LegalOperations) {
+static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
+                                       const SDLoc &DL) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Vec = ExtElt->getOperand(0);
   SDValue Index = ExtElt->getOperand(1);
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
+  unsigned Opc = Vec.getOpcode();
+  if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() ||
       Vec->getNumValues() != 1)
     return SDValue();
 
+  EVT ResVT = ExtElt->getValueType(0);
+  if (Opc == ISD::SETCC && ResVT != Vec.getValueType().getVectorElementType())
+    return SDValue();
+
   // Targets may want to avoid this to prevent an expensive register transfer.
   if (!TLI.shouldScalarizeBinop(Vec))
     return SDValue();
@@ -22766,19 +22771,25 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
   SDValue Op0 = Vec.getOperand(0);
   SDValue Op1 = Vec.getOperand(1);
   APInt SplatVal;
-  if (isAnyConstantBuildVector(Op0, true) ||
-      ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
-      isAnyConstantBuildVector(Op1, true) ||
-      ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
-    // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
-    // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
-    EVT VT = ExtElt->getValueType(0);
-    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
-    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
-    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
+  if (!isAnyConstantBuildVector(Op0, true) &&
+      !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) &&
+      !isAnyConstantBuildVector(Op1, true) &&
+      !ISD::isConstantSplatVector(Op1.getNode(), SplatVal))
+    return SDValue();
+
+  // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C'
+  // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC)
+  if (Opc == ISD::SETCC) {
+    EVT OpVT = Op0->getValueType(0).getVectorElementType();
+    Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index);
+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index);
+    return DAG.getSetCC(DL, ResVT, Op0, Op1,
+                        cast<CondCodeSDNode>(Vec->getOperand(2))->get());
+  } else {
+    Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op0, Index);
+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op1, Index);
+    return DAG.getNode(Opc, DL, ResVT, Op0, Op1);
   }
-
-  return SDValue();
 }
 
 // Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
@@ -23011,7 +23022,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     }
   }
 
-  if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
+  if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL))
     return BO;
 
   if (VecVT.isScalableVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cb0b9e965277aa..d51b36f7e49946 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1348,6 +1348,10 @@ class AArch64TargetLowering : public TargetLowering {
   unsigned getMinimumJumpTableEntries() const override;
 
   bool softPromoteHalfType() const override { return true; }
+
+  bool shouldScalarizeBinop(SDValue VecOp) const override {
+    return VecOp.getOpcode() == ISD::SETCC;
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6eae756b25fb5b..c0021f69f18e77 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2093,7 +2093,7 @@ bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 2d00889407ff48..a52af6832d583f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -429,7 +429,7 @@ bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9048d1d83f1874..d3b569eccc17be 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3300,7 +3300,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 5a5dee0b53d439..4cb1d5b2fb345d 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -5,7 +5,7 @@
 
 declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x 16 x ptr>, i32 immarg, <vscale x 16 x i1>)
 
-define fastcc i8 @allocno_reload_assign() {
+define fastcc i8 @allocno_reload_assign(ptr %p) {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, xzr
@@ -14,8 +14,8 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    uzp1 p0.s, p0.s, p0.s
 ; CHECK-NEXT:    uzp1 p0.h, p0.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p0.b, p0.b
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    uzp1 p8.b, p0.b, p0.b
+; CHECK-NEXT:    mov z0.b, p8/z, #1 // =0x1
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
@@ -30,34 +30,35 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT:    punpklo p1.h, p0.b
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    punpklo p2.h, p1.b
-; CHECK-NEXT:    punpkhi p3.h, p1.b
+; CHECK-NEXT:    punpkhi p4.h, p1.b
 ; CHECK-NEXT:    uunpklo z0.d, z2.s
 ; CHECK-NEXT:    uunpkhi z1.d, z2.s
-; CHECK-NEXT:    punpklo p5.h, p0.b
+; CHECK-NEXT:    punpklo p6.h, p0.b
 ; CHECK-NEXT:    uunpklo z2.d, z3.s
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    punpkhi p7.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpklo z4.d, z5.s
 ; CHECK-NEXT:    uunpkhi z5.d, z5.s
 ; CHECK-NEXT:    uunpklo z6.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:    punpklo p0.h, p2.b
-; CHECK-NEXT:    punpkhi p1.h, p2.b
-; CHECK-NEXT:    punpklo p2.h, p3.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    punpklo p4.h, p5.b
-; CHECK-NEXT:    punpkhi p5.h, p5.b
-; CHECK-NEXT:    punpklo p6.h, p7.b
-; CHECK-NEXT:    punpkhi p7.h, p7.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
+; CHECK-NEXT:    punpkhi p2.h, p2.b
+; CHECK-NEXT:    punpklo p3.h, p4.b
+; CHECK-NEXT:    punpkhi p4.h, p4.b
+; CHECK-NEXT:    punpklo p5.h, p6.b
+; CHECK-NEXT:    punpkhi p6.h, p6.b
+; CHECK-NEXT:    punpklo p7.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    st1b { z0.d }, p0, [z16.d]
-; CHECK-NEXT:    st1b { z1.d }, p1, [z16.d]
-; CHECK-NEXT:    st1b { z2.d }, p2, [z16.d]
-; CHECK-NEXT:    st1b { z3.d }, p3, [z16.d]
-; CHECK-NEXT:    st1b { z4.d }, p4, [z16.d]
-; CHECK-NEXT:    st1b { z5.d }, p5, [z16.d]
-; CHECK-NEXT:    st1b { z6.d }, p6, [z16.d]
-; CHECK-NEXT:    st1b { z7.d }, p7, [z16.d]
+; CHECK-NEXT:    st1b { z0.d }, p1, [z16.d]
+; CHECK-NEXT:    st1b { z1.d }, p2, [z16.d]
+; CHECK-NEXT:    st1b { z2.d }, p3, [z16.d]
+; CHECK-NEXT:    st1b { z3.d }, p4, [z16.d]
+; CHECK-NEXT:    st1b { z4.d }, p5, [z16.d]
+; CHECK-NEXT:    st1b { z5.d }, p6, [z16.d]
+; CHECK-NEXT:    st1b { z6.d }, p7, [z16.d]
+; CHECK-NEXT:    st1b { z7.d }, p0, [z16.d]
+; CHECK-NEXT:    str p8, [x0]
 ; CHECK-NEXT:    b .LBB0_1
   br label %1
 
@@ -66,6 +67,7 @@ define fastcc i8 @allocno_reload_assign() {
   %constexpr1 = shufflevector <vscale x 16 x i1> %constexpr, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %constexpr2 = xor <vscale x 16 x i1> %constexpr1, shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
   call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> %constexpr2)
+  store <vscale x 16 x i1> %constexpr, ptr %p, align 16
   br label %1
 }
 
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
new file mode 100644
index 00000000000000..2388a0f206a51c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #5
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat (i32 5)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #7
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> splat(i32 7), %a
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #234
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, <i32 5, i32 234, i32 -1, i32 7>
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
+; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
+  %ext = extractelement <4 x i1> %fcmp, i32 1
+  ret i1 %ext
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    mov w9, #16 // =0x10
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    mov w10, #1 // =0x1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, #2 // =0x2
+; CHECK-NEXT:    b .LBB4_2
+; CHECK-NEXT:  .LBB4_1: // %pred.store.continue18
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    subs x9, x9, #4
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.eq .LBB4_10
+; CHECK-NEXT:  .LBB4_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_4
+; CHECK-NEXT:  // %bb.3: // %pred.store.if
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-8]
+; CHECK-NEXT:  .LBB4_4: // %pred.store.continue
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mov x11, v0.d[1]
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_6
+; CHECK-NEXT:  // %bb.5: // %pred.store.if5
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-4]
+; CHECK-NEXT:  .LBB4_6: // %pred.store.continue6
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    fmov x11, d1
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_8
+; CHECK-NEXT:  // %bb.7: // %pred.store.if7
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:  .LBB4_8: // %pred.store.continue8
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mov x11, v1.d[1]
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_1
+; CHECK-NEXT:  // %bb.9: // %pred.store.if9
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    str w10, [x8, #4]
+; CHECK-NEXT:    b .LBB4_1
+; CHECK-NEXT:  .LBB4_10: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+  %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+  %0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
+  %1 = extractelement <4 x i1> %0, i64 0
+  br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+  %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 1, ptr %2, align 4
+  br label %pred.store.continue
+
+pred.store.continue:
+  %3 = extractelement <4 x i1> %0, i64 1
+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1
+  %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:
+  %6 = extractelement <4 x i1> %0, i64 2
+  br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+  %7 = or disjoint i64 %index, 2
+  %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+  store i32 1, ptr %8, align 4
+  br label %pred.store.continue8
+
+pred.store.continue8:
+  %9 = extractelement <4 x i1> %0, i64 3
+  br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+  %10 = or disjoint i64 %index, 3
+  %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+  store i32 1, ptr %11, align 4
+  br label %pred.store.continue18
+
+pred.store.continue18:
+  %index.next = add i64 %index, 4
+  %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+  %24 = icmp eq i64 %index.next, 16
+  br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+
+; Negative tests
+
+define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %ins = insertelement <4 x i32> poison, i32 %b, i32 0
+  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
+  %icmp = icmp ult <4 x i32> %a, %splat
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #235
+; CHECK-NEXT:    adrp x9, .LCPI6_0
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI6_0]
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v1.4h, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    and w0, w9, #0x1
+; CHECK-NEXT:    strb w10, [x8]
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 235)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  store <4 x i1> %icmp, ptr %p, align 4
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_unknown_idx(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_unknown_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movi v1.4s, #127
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    bfi x8, x0, #1, #2
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    str d0, [sp, #8]
+; CHECK-NEXT:    ldrh w8, [x8]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 127)
+  %ext = extractelement <4 x i1> %icmp, i32 %c
+  ret i1 %ext
+}
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 9acd995d612c31..be6ee8f6899584 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -796,3 +796,29 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
   ret i64 %4
 }
 
+; Tests the scalarizeBinOp code in DAGCombiner
+define void @scalarize_binop(<1 x i1> %a) {
+; SSE-LABEL: scalarize_binop:
+; SSE:       # %bb.0: # %bb0
+; SSE-NEXT:    .p2align 4
+; SSE-NEXT:  .LBB35_1: # %bb1
+; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE-NEXT:    jmp .LBB35_1
+;
+; AVX-LABEL: scalarize_binop:
+; AVX:       # %bb.0: # %bb0
+; AVX-NEXT:    .p2align 4
+; AVX-NEXT:  .LBB35_1: # %bb1
+; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX-NEXT:    jmp .LBB35_1
+bb0:
+  br label %bb1
+
+bb1:
+  %b = select <1 x i1> %a, <1 x i1> zeroinitializer, <1 x i1> splat (i1 true)
+  br label %bb2
+
+bb2:
+  %c = extractelement <1 x i1> %b, i32 0
+  br label %bb1
+}

llvmbot · 2024-11-25T14:53:41Z

@llvm/pr-subscribers-llvm-selectiondag

Author: David Sherwood (david-arm)

Changes

For IR like this:

%icmp = icmp ult <4 x i32> %a, splat (i32 5)
%res = extractelement <4 x i1> %icmp, i32 1

where there is only one use of %icmp we can take a similar approach
to what we already do for binary ops such add, sub, etc. and convert
this into

%ext = extractelement <4 x i32> %a, i32 1
%res = icmp ult i32 %ext, 5

For AArch64 targets at least the scalar boolean result will almost
certainly need to be in a GPR anyway, since it will probably be
used by branches for control flow. I've tried to reuse existing code
in scalarizeExtractedBinop to also work for setcc.

NOTE: The optimisations don't apply for tests such as
extract_icmp_v4i32_splat_rhs in the file

CodeGen/AArch64/extract-vector-cmp.ll

because scalarizeExtractedBinOp only works if one of the input
operands is a constant.

Full diff: https://github.com/llvm/llvm-project/pull/117566.diff

8 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+27-16)
(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+4)
(modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+1-1)
(modified) llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp (+1-1)
(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+1-1)
(modified) llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll (+24-22)
(added) llvm/test/CodeGen/AArch64/extract-vector-cmp.ll (+225)
(modified) llvm/test/CodeGen/X86/vselect.ll (+26)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 521829675ae7c3..e8e4e4e9146531 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22746,16 +22746,21 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
 
 /// Transform a vector binary operation into a scalar binary operation by moving
 /// the math/logic after an extract element of a vector.
-static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
-                                       const SDLoc &DL, bool LegalOperations) {
+static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
+                                       const SDLoc &DL) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Vec = ExtElt->getOperand(0);
   SDValue Index = ExtElt->getOperand(1);
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
+  unsigned Opc = Vec.getOpcode();
+  if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() ||
       Vec->getNumValues() != 1)
     return SDValue();
 
+  EVT ResVT = ExtElt->getValueType(0);
+  if (Opc == ISD::SETCC && ResVT != Vec.getValueType().getVectorElementType())
+    return SDValue();
+
   // Targets may want to avoid this to prevent an expensive register transfer.
   if (!TLI.shouldScalarizeBinop(Vec))
     return SDValue();
@@ -22766,19 +22771,25 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
   SDValue Op0 = Vec.getOperand(0);
   SDValue Op1 = Vec.getOperand(1);
   APInt SplatVal;
-  if (isAnyConstantBuildVector(Op0, true) ||
-      ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
-      isAnyConstantBuildVector(Op1, true) ||
-      ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
-    // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
-    // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
-    EVT VT = ExtElt->getValueType(0);
-    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
-    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
-    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
+  if (!isAnyConstantBuildVector(Op0, true) &&
+      !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) &&
+      !isAnyConstantBuildVector(Op1, true) &&
+      !ISD::isConstantSplatVector(Op1.getNode(), SplatVal))
+    return SDValue();
+
+  // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C'
+  // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC)
+  if (Opc == ISD::SETCC) {
+    EVT OpVT = Op0->getValueType(0).getVectorElementType();
+    Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index);
+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index);
+    return DAG.getSetCC(DL, ResVT, Op0, Op1,
+                        cast<CondCodeSDNode>(Vec->getOperand(2))->get());
+  } else {
+    Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op0, Index);
+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op1, Index);
+    return DAG.getNode(Opc, DL, ResVT, Op0, Op1);
   }
-
-  return SDValue();
 }
 
 // Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
@@ -23011,7 +23022,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     }
   }
 
-  if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
+  if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL))
     return BO;
 
   if (VecVT.isScalableVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cb0b9e965277aa..d51b36f7e49946 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1348,6 +1348,10 @@ class AArch64TargetLowering : public TargetLowering {
   unsigned getMinimumJumpTableEntries() const override;
 
   bool softPromoteHalfType() const override { return true; }
+
+  bool shouldScalarizeBinop(SDValue VecOp) const override {
+    return VecOp.getOpcode() == ISD::SETCC;
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6eae756b25fb5b..c0021f69f18e77 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2093,7 +2093,7 @@ bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 2d00889407ff48..a52af6832d583f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -429,7 +429,7 @@ bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9048d1d83f1874..d3b569eccc17be 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3300,7 +3300,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
 
   // Assume target opcodes can't be scalarized.
   // TODO - do we have any exceptions?
-  if (Opc >= ISD::BUILTIN_OP_END)
+  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
     return false;
 
   // If the vector op is not supported, try to convert to scalar.
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 5a5dee0b53d439..4cb1d5b2fb345d 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -5,7 +5,7 @@
 
 declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x 16 x ptr>, i32 immarg, <vscale x 16 x i1>)
 
-define fastcc i8 @allocno_reload_assign() {
+define fastcc i8 @allocno_reload_assign(ptr %p) {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, xzr
@@ -14,8 +14,8 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    uzp1 p0.s, p0.s, p0.s
 ; CHECK-NEXT:    uzp1 p0.h, p0.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p0.b, p0.b
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    uzp1 p8.b, p0.b, p0.b
+; CHECK-NEXT:    mov z0.b, p8/z, #1 // =0x1
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
@@ -30,34 +30,35 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT:    punpklo p1.h, p0.b
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    punpklo p2.h, p1.b
-; CHECK-NEXT:    punpkhi p3.h, p1.b
+; CHECK-NEXT:    punpkhi p4.h, p1.b
 ; CHECK-NEXT:    uunpklo z0.d, z2.s
 ; CHECK-NEXT:    uunpkhi z1.d, z2.s
-; CHECK-NEXT:    punpklo p5.h, p0.b
+; CHECK-NEXT:    punpklo p6.h, p0.b
 ; CHECK-NEXT:    uunpklo z2.d, z3.s
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    punpkhi p7.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpklo z4.d, z5.s
 ; CHECK-NEXT:    uunpkhi z5.d, z5.s
 ; CHECK-NEXT:    uunpklo z6.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:    punpklo p0.h, p2.b
-; CHECK-NEXT:    punpkhi p1.h, p2.b
-; CHECK-NEXT:    punpklo p2.h, p3.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    punpklo p4.h, p5.b
-; CHECK-NEXT:    punpkhi p5.h, p5.b
-; CHECK-NEXT:    punpklo p6.h, p7.b
-; CHECK-NEXT:    punpkhi p7.h, p7.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
+; CHECK-NEXT:    punpkhi p2.h, p2.b
+; CHECK-NEXT:    punpklo p3.h, p4.b
+; CHECK-NEXT:    punpkhi p4.h, p4.b
+; CHECK-NEXT:    punpklo p5.h, p6.b
+; CHECK-NEXT:    punpkhi p6.h, p6.b
+; CHECK-NEXT:    punpklo p7.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    st1b { z0.d }, p0, [z16.d]
-; CHECK-NEXT:    st1b { z1.d }, p1, [z16.d]
-; CHECK-NEXT:    st1b { z2.d }, p2, [z16.d]
-; CHECK-NEXT:    st1b { z3.d }, p3, [z16.d]
-; CHECK-NEXT:    st1b { z4.d }, p4, [z16.d]
-; CHECK-NEXT:    st1b { z5.d }, p5, [z16.d]
-; CHECK-NEXT:    st1b { z6.d }, p6, [z16.d]
-; CHECK-NEXT:    st1b { z7.d }, p7, [z16.d]
+; CHECK-NEXT:    st1b { z0.d }, p1, [z16.d]
+; CHECK-NEXT:    st1b { z1.d }, p2, [z16.d]
+; CHECK-NEXT:    st1b { z2.d }, p3, [z16.d]
+; CHECK-NEXT:    st1b { z3.d }, p4, [z16.d]
+; CHECK-NEXT:    st1b { z4.d }, p5, [z16.d]
+; CHECK-NEXT:    st1b { z5.d }, p6, [z16.d]
+; CHECK-NEXT:    st1b { z6.d }, p7, [z16.d]
+; CHECK-NEXT:    st1b { z7.d }, p0, [z16.d]
+; CHECK-NEXT:    str p8, [x0]
 ; CHECK-NEXT:    b .LBB0_1
   br label %1
 
@@ -66,6 +67,7 @@ define fastcc i8 @allocno_reload_assign() {
   %constexpr1 = shufflevector <vscale x 16 x i1> %constexpr, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %constexpr2 = xor <vscale x 16 x i1> %constexpr1, shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
   call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> %constexpr2)
+  store <vscale x 16 x i1> %constexpr, ptr %p, align 16
   br label %1
 }
 
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
new file mode 100644
index 00000000000000..2388a0f206a51c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #5
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat (i32 5)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #7
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> splat(i32 7), %a
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    cmp w8, #234
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, <i32 5, i32 234, i32 -1, i32 7>
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
+; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
+  %ext = extractelement <4 x i1> %fcmp, i32 1
+  ret i1 %ext
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    mov w9, #16 // =0x10
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    mov w10, #1 // =0x1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, #2 // =0x2
+; CHECK-NEXT:    b .LBB4_2
+; CHECK-NEXT:  .LBB4_1: // %pred.store.continue18
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    subs x9, x9, #4
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.eq .LBB4_10
+; CHECK-NEXT:  .LBB4_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_4
+; CHECK-NEXT:  // %bb.3: // %pred.store.if
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-8]
+; CHECK-NEXT:  .LBB4_4: // %pred.store.continue
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mov x11, v0.d[1]
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_6
+; CHECK-NEXT:  // %bb.5: // %pred.store.if5
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-4]
+; CHECK-NEXT:  .LBB4_6: // %pred.store.continue6
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    fmov x11, d1
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_8
+; CHECK-NEXT:  // %bb.7: // %pred.store.if7
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:  .LBB4_8: // %pred.store.continue8
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mov x11, v1.d[1]
+; CHECK-NEXT:    cmp x11, #14
+; CHECK-NEXT:    b.hi .LBB4_1
+; CHECK-NEXT:  // %bb.9: // %pred.store.if9
+; CHECK-NEXT:    // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    str w10, [x8, #4]
+; CHECK-NEXT:    b .LBB4_1
+; CHECK-NEXT:  .LBB4_10: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+  %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+  %0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
+  %1 = extractelement <4 x i1> %0, i64 0
+  br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+  %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 1, ptr %2, align 4
+  br label %pred.store.continue
+
+pred.store.continue:
+  %3 = extractelement <4 x i1> %0, i64 1
+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1
+  %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:
+  %6 = extractelement <4 x i1> %0, i64 2
+  br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+  %7 = or disjoint i64 %index, 2
+  %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+  store i32 1, ptr %8, align 4
+  br label %pred.store.continue8
+
+pred.store.continue8:
+  %9 = extractelement <4 x i1> %0, i64 3
+  br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+  %10 = or disjoint i64 %index, 3
+  %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+  store i32 1, ptr %11, align 4
+  br label %pred.store.continue18
+
+pred.store.continue18:
+  %index.next = add i64 %index, 4
+  %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+  %24 = icmp eq i64 %index.next, 16
+  br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+
+; Negative tests
+
+define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %ins = insertelement <4 x i32> poison, i32 %b, i32 0
+  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
+  %icmp = icmp ult <4 x i32> %a, %splat
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #235
+; CHECK-NEXT:    adrp x9, .LCPI6_0
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI6_0]
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v1.4h, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    and w0, w9, #0x1
+; CHECK-NEXT:    strb w10, [x8]
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 235)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  store <4 x i1> %icmp, ptr %p, align 4
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_unknown_idx(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_unknown_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movi v1.4s, #127
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    bfi x8, x0, #1, #2
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    str d0, [sp, #8]
+; CHECK-NEXT:    ldrh w8, [x8]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 127)
+  %ext = extractelement <4 x i1> %icmp, i32 %c
+  ret i1 %ext
+}
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 9acd995d612c31..be6ee8f6899584 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -796,3 +796,29 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
   ret i64 %4
 }
 
+; Tests the scalarizeBinOp code in DAGCombiner
+define void @scalarize_binop(<1 x i1> %a) {
+; SSE-LABEL: scalarize_binop:
+; SSE:       # %bb.0: # %bb0
+; SSE-NEXT:    .p2align 4
+; SSE-NEXT:  .LBB35_1: # %bb1
+; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE-NEXT:    jmp .LBB35_1
+;
+; AVX-LABEL: scalarize_binop:
+; AVX:       # %bb.0: # %bb0
+; AVX-NEXT:    .p2align 4
+; AVX-NEXT:  .LBB35_1: # %bb1
+; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX-NEXT:    jmp .LBB35_1
+bb0:
+  br label %bb1
+
+bb1:
+  %b = select <1 x i1> %a, <1 x i1> zeroinitializer, <1 x i1> splat (i1 true)
+  br label %bb2
+
+bb2:
+  %c = extractelement <1 x i1> %b, i32 0
+  br label %bb1
+}

arsenm · 2024-11-25T16:47:57Z

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Vec = ExtElt->getOperand(0);
  SDValue Index = ExtElt->getOperand(1);
  auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
+  unsigned Opc = Vec.getOpcode();
+  if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() ||


Suggested change

if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() ||

if (!IndexC || !Vec.hasOneUse() || (Opc != ISD::SETCC && !TLI.isBinOp(Opc)) ||

arsenm · 2024-11-25T16:53:47Z

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+  if (!isAnyConstantBuildVector(Op0, true) &&
+      !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) &&
+      !isAnyConstantBuildVector(Op1, true) &&
+      !ISD::isConstantSplatVector(Op1.getNode(), SplatVal))
+    return SDValue();


This is a confusing predicate. It would be better to rewrite in terms of one predicate on each operand

I see we already have a confusing number of predicate functions in SelectionDAG and DAGCombiner related to asking similar, but not identical questions about whether an operand is a constant splat/non-constant splat/constant build vector/etc. If it's ok with you I'd prefer not to add one more for now in this patch, but I'm happy to do a follow-on patch that tries to rationalise/refactor some of the existing interfaces. Hopefully that will then allow me to rewrite and simplify this code. What do you think?

Sure, follow up. I always find these predicates confusing and there never seems to be the exact right one

arsenm · 2024-11-25T16:53:54Z

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+    Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index);
+    return DAG.getSetCC(DL, ResVT, Op0, Op1,
+                        cast<CondCodeSDNode>(Vec->getOperand(2))->get());
+  } else {


No else after return

arsenm · 2024-11-25T16:54:06Z

llvm/test/CodeGen/AArch64/extract-vector-cmp.ll

+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1


Use named values in tests

arsenm · 2024-11-25T16:54:28Z

llvm/test/CodeGen/AArch64/extract-vector-cmp.ll

+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:


Function really big for a dag combine test

I agree it was probably too big. I was trying to highlight the motivating example from LoopVectorize.cpp and demonstrate the improvement. I've reduced the test to use a <2 x i64> instead to reduce the size. Hopefully this is small enough now!

I've also tried to properly name all the variables.

paulwalker-arm · 2024-12-03T12:46:31Z

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+  // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C'
+  // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC)
+  if (Opc == ISD::SETCC) {
+    EVT OpVT = Op0->getValueType(0).getVectorElementType();


If the combine occurs after type legalisation it's possible for OpVT to be a non-legal type despite Op0 being type legal.

Perhaps it's worth passing in LegalTypes (instead of the original LegalOperations) to be checked as part of the Opc == ISD::SETCC early exit?

The stock binop case works because it assumes the operation is independent of the element type, which is not true for comparisons.

paulwalker-arm · 2024-12-03T15:05:22Z

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+  // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C'
+  // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC)
+  if (Opc == ISD::SETCC) {
+    EVT OpVT = Op0->getValueType(0).getVectorElementType();


Suggested change

EVT OpVT = Op0->getValueType(0).getVectorElementType();

EVT OpVT = Op0.getValueType().getVectorElementType();

* Rewrote the test in dag-combine-concat-vectors.ll to ensure there is more than one use of the vector comparison.

For IR like this: %icmp = icmp ult <4 x i32> %a, splat (i32 5) %res = extractelement <4 x i1> %icmp, i32 1 where there is only one use of %icmp we can take a similar approach to what we already do for binary ops such add, sub, etc. and convert this into %ext = extractelement <4 x i32> %a, i32 1 %res = icmp ult i32 %ext, 5 For AArch64 targets at least the scalar boolean result will almost certainly need to be in a GPR anyway, since it will probably be used by branches for control flow. I've tried to reuse existing code in scalarizeExtractedBinop to also work for setcc. NOTE: The optimisations don't apply for tests such as extract_icmp_v4i32_splat_rhs in the file CodeGen/AArch64/extract-vector-cmp.ll because scalarizeExtractedBinOp only works if one of the input operands is a constant.

vitalybuka · 2024-12-04T20:20:59Z

This patch breaks https://lab.llvm.org/buildbot/#/builders/55/builds/3959

…r setcc" (#118693) Reverts #117566 Breaks libc++ tests with HWASAN https://lab.llvm.org/buildbot/#/builders/55/builds/3959

vitalybuka · 2024-12-04T20:43:59Z

@david-arm Please acknowledge that you noticed the revert.

Please let me know if you need help with reproducer, but I guess it's quite straight forward either using https://github.com/google/sanitizers/wiki/SanitizerBotReproduceBuild or copy-pasting cmake commands from the bot.

david-arm · 2024-12-05T09:30:43Z

@david-arm Please acknowledge that you noticed the revert.

Please let me know if you need help with reproducer, but I guess it's quite straight forward either using https://github.com/google/sanitizers/wiki/SanitizerBotReproduceBuild or copy-pasting cmake commands from the bot.

Yep, I'm taking a look now! The error does look suspiciously like it's caused by this patch.

…or setcc (#117566)" (#118823) [Reverts d57892a] For IR like this: %icmp = icmp ult <4 x i32> %a, splat (i32 5) %res = extractelement <4 x i1> %icmp, i32 1 where there is only one use of %icmp we can take a similar approach to what we already do for binary ops such add, sub, etc. and convert this into %ext = extractelement <4 x i32> %a, i32 1 %res = icmp ult i32 %ext, 5 For AArch64 targets at least the scalar boolean result will almost certainly need to be in a GPR anyway, since it will probably be used by branches for control flow. I've tried to reuse existing code in scalarizeExtractedBinop to also work for setcc. NOTE: The optimisations don't apply for tests such as extract_icmp_v4i32_splat_rhs in the file CodeGen/AArch64/extract-vector-cmp.ll because scalarizeExtractedBinOp only works if one of the input operands is a constant. --------- Co-authored-by: Paul Walker <[email protected]>

david-arm requested review from arsenm, RKSimon, davemgreen and paulwalker-arm November 25, 2024 14:53

llvmbot added backend:AArch64 backend:RISC-V backend:WebAssembly backend:X86 llvm:SelectionDAG SelectionDAGISel as well labels Nov 25, 2024

arsenm reviewed Nov 25, 2024

View reviewed changes

david-arm force-pushed the extract_vcmp branch from 00df763 to f22c132 Compare November 28, 2024 09:40

paulwalker-arm reviewed Dec 3, 2024

View reviewed changes

paulwalker-arm approved these changes Dec 3, 2024

View reviewed changes

arsenm approved these changes Dec 3, 2024

View reviewed changes

david-arm added 4 commits December 4, 2024 09:50

Add tests

4717e0c

* Rewrote the test in dag-combine-concat-vectors.ll to ensure there is more than one use of the vector comparison.

Address review comment

cd917c8

Address review comment

50618d6

david-arm force-pushed the extract_vcmp branch from 57c3d09 to 50618d6 Compare December 4, 2024 09:51

david-arm merged commit 4675db5 into llvm:main Dec 4, 2024
6 of 8 checks passed

vitalybuka mentioned this pull request Dec 4, 2024

Revert "[DAGCombiner] Add support for scalarising extracts of a vector setcc" #118693

Merged

vitalybuka added a commit that referenced this pull request Dec 4, 2024

Revert "[DAGCombiner] Add support for scalarising extracts of a vecto…

d57892a

…r setcc" (#118693) Reverts #117566 Breaks libc++ tests with HWASAN https://lab.llvm.org/buildbot/#/builders/55/builds/3959

david-arm mentioned this pull request Dec 5, 2024

Reapply "[DAGCombiner] Add support for scalarising extracts of a vector setcc (#117566)" #118823

Merged

	if (!IndexC \|\| (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) \|\| !Vec.hasOneUse() \|\|
	if (!IndexC \|\| !Vec.hasOneUse() \|\| (Opc != ISD::SETCC && !TLI.isBinOp(Opc)) \|\|

	EVT OpVT = Op0->getValueType(0).getVectorElementType();
	EVT OpVT = Op0.getValueType().getVectorElementType();

[DAGCombiner] Add support for scalarising extracts of a vector setcc #117566

[DAGCombiner] Add support for scalarising extracts of a vector setcc #117566

Uh oh!

Conversation

david-arm commented Nov 25, 2024

Uh oh!

llvmbot commented Nov 25, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Nov 25, 2024

Uh oh!

llvmbot commented Nov 25, 2024

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

vitalybuka commented Dec 4, 2024

Uh oh!

vitalybuka commented Dec 4, 2024

Uh oh!

david-arm commented Dec 5, 2024

Uh oh!

Uh oh!

llvmbot commented Nov 25, 2024 •

edited

Loading