Skip to content

Commit b3f6a01

Browse files
committed
[AArch64] Eliminate Common Subexpression of CSEL by Reassociation
If we have a CSEL instruction that depends on the flags set by a (SUBS x c) instruction, and the true and/or false expression is (add (add x y) -c), we can reassociate the latter expression to (add (SUBS x c) y) and save one instruction. The transformation works for unsigned comparisons, and for equality comparisons with 0 (by first converting them to unsigned comparisons). Proof for the basic transformation: https://alive2.llvm.org/ce/z/-337Pb. Fixes #119606.
1 parent e91c3b4 commit b3f6a01

File tree

2 files changed

+115
-47
lines changed

2 files changed

+115
-47
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24838,6 +24838,83 @@ static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
2483824838
return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
2483924839
}
2484024840

24841+
// Reassociate the true/false expressions of a CSEL instruction to obtain a
24842+
// common subexpression with the comparison instruction. For example, change
24843+
// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
24844+
// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
24845+
// subexpression.
24846+
static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
24847+
SDValue SubsNode = N->getOperand(3);
24848+
if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
24849+
return SDValue();
24850+
auto *CmpOpConst = dyn_cast<ConstantSDNode>(SubsNode.getOperand(1));
24851+
if (!CmpOpConst)
24852+
return SDValue();
24853+
24854+
auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
24855+
bool IsEquality = CC == AArch64CC::EQ || CC == AArch64CC::NE;
24856+
if (CC != AArch64CC::LO && CC != AArch64CC::HI &&
24857+
(!IsEquality || !CmpOpConst->isZero()))
24858+
return SDValue();
24859+
// The cases (x < c) and (x == 0) are later unified as (x < newconst).
24860+
// The cases (x > c) and (x != 0) are later unified as (x >= newconst).
24861+
APInt NewCmpConst = CC == AArch64CC::LO ? CmpOpConst->getAPIntValue()
24862+
: CmpOpConst->getAPIntValue() + 1;
24863+
APInt ExpectedConst = -NewCmpConst;
24864+
24865+
SDValue CmpOpOther = SubsNode.getOperand(0);
24866+
EVT VT = N->getValueType(0);
24867+
SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
24868+
DAG.getVTList(VT, MVT_CC), CmpOpOther,
24869+
DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
24870+
CmpOpConst->getValueType(0)));
24871+
24872+
auto Reassociate = [&](SDValue Op) {
24873+
if (Op.getOpcode() != ISD::ADD)
24874+
return SDValue();
24875+
auto *AddOpConst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
24876+
if (!AddOpConst)
24877+
return SDValue();
24878+
if (IsEquality && AddOpConst->getAPIntValue() != ExpectedConst)
24879+
return SDValue();
24880+
if (!IsEquality && AddOpConst->getAPIntValue() != ExpectedConst)
24881+
return SDValue();
24882+
if (Op.getOperand(0).getOpcode() != ISD::ADD ||
24883+
!Op.getOperand(0).hasOneUse())
24884+
return SDValue();
24885+
SDValue X = Op.getOperand(0).getOperand(0);
24886+
SDValue Y = Op.getOperand(0).getOperand(1);
24887+
if (X != CmpOpOther)
24888+
std::swap(X, Y);
24889+
if (X != CmpOpOther)
24890+
return SDValue();
24891+
SDNodeFlags Flags;
24892+
if (Op.getOperand(0).getNode()->getFlags().hasNoUnsignedWrap())
24893+
Flags.setNoUnsignedWrap(true);
24894+
return DAG.getNode(ISD::ADD, SDLoc(Op), VT, NewCmp.getValue(0), Y, Flags);
24895+
};
24896+
24897+
SDValue TValReassoc = Reassociate(N->getOperand(0));
24898+
SDValue FValReassoc = Reassociate(N->getOperand(1));
24899+
if (!TValReassoc && !FValReassoc)
24900+
return SDValue();
24901+
if (TValReassoc)
24902+
DAG.ReplaceAllUsesWith(N->getOperand(0), TValReassoc);
24903+
else
24904+
TValReassoc = N->getOperand(0);
24905+
if (FValReassoc)
24906+
DAG.ReplaceAllUsesWith(N->getOperand(1), FValReassoc);
24907+
else
24908+
FValReassoc = N->getOperand(1);
24909+
24910+
AArch64CC::CondCode NewCC = CC == AArch64CC::EQ || CC == AArch64CC::LO
24911+
? AArch64CC::LO
24912+
: AArch64CC::HS;
24913+
return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
24914+
DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC),
24915+
NewCmp.getValue(1));
24916+
}
24917+
2484124918
// Optimize CSEL instructions
2484224919
static SDValue performCSELCombine(SDNode *N,
2484324920
TargetLowering::DAGCombinerInfo &DCI,
@@ -24849,6 +24926,11 @@ static SDValue performCSELCombine(SDNode *N,
2484924926
if (SDValue R = foldCSELOfCSEL(N, DAG))
2485024927
return R;
2485124928

24929+
// Try to reassociate the true/false expressions so that we can do CSE with
24930+
// a SUBS instruction used to perform the comparison.
24931+
if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
24932+
return R;
24933+
2485224934
// CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
2485324935
// CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
2485424936
if (SDValue Folded = foldCSELofCTTZ(N, DAG))

llvm/test/CodeGen/AArch64/csel-cmp-cse.ll

Lines changed: 33 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@ declare void @use_i32(i32 %x)
88
define ptr @test_last_elem_from_ptr(ptr noundef readnone %x0, i64 noundef %x1) {
99
; CHECK-LABEL: test_last_elem_from_ptr:
1010
; CHECK: // %bb.0:
11-
; CHECK-NEXT: add x8, x0, x1
12-
; CHECK-NEXT: cmp x1, #0
13-
; CHECK-NEXT: sub x8, x8, #1
14-
; CHECK-NEXT: csel x0, xzr, x8, eq
11+
; CHECK-NEXT: subs x8, x1, #1
12+
; CHECK-NEXT: add x8, x8, x0
13+
; CHECK-NEXT: csel x0, xzr, x8, lo
1514
; CHECK-NEXT: ret
1615
%cmp = icmp eq i64 %x1, 0
1716
%add.ptr = getelementptr inbounds nuw i8, ptr %x0, i64 %x1
@@ -23,10 +22,9 @@ define ptr @test_last_elem_from_ptr(ptr noundef readnone %x0, i64 noundef %x1) {
2322
define i32 @test_eq0_sub_add_i32(i32 %x0, i32 %x1) {
2423
; CHECK-LABEL: test_eq0_sub_add_i32:
2524
; CHECK: // %bb.0:
26-
; CHECK-NEXT: add w8, w0, w1
27-
; CHECK-NEXT: cmp w1, #0
28-
; CHECK-NEXT: sub w8, w8, #1
29-
; CHECK-NEXT: csel w0, wzr, w8, eq
25+
; CHECK-NEXT: subs w8, w1, #1
26+
; CHECK-NEXT: add w8, w8, w0
27+
; CHECK-NEXT: csel w0, wzr, w8, lo
3028
; CHECK-NEXT: ret
3129
%cmp = icmp eq i32 %x1, 0
3230
%add = add nuw i32 %x0, %x1
@@ -38,9 +36,8 @@ define i32 @test_eq0_sub_add_i32(i32 %x0, i32 %x1) {
3836
define i32 @test_ule7_sub_add_i32(i32 %x0, i32 %x1) {
3937
; CHECK-LABEL: test_ule7_sub_add_i32:
4038
; CHECK: // %bb.0:
41-
; CHECK-NEXT: add w8, w0, w1
42-
; CHECK-NEXT: cmp w1, #8
43-
; CHECK-NEXT: sub w8, w8, #8
39+
; CHECK-NEXT: subs w8, w1, #8
40+
; CHECK-NEXT: add w8, w8, w0
4441
; CHECK-NEXT: csel w0, wzr, w8, lo
4542
; CHECK-NEXT: ret
4643
%cmp = icmp ule i32 %x1, 7
@@ -53,10 +50,9 @@ define i32 @test_ule7_sub_add_i32(i32 %x0, i32 %x1) {
5350
define i32 @test_ule0_sub_add_i32(i32 %x0, i32 %x1) {
5451
; CHECK-LABEL: test_ule0_sub_add_i32:
5552
; CHECK: // %bb.0:
56-
; CHECK-NEXT: add w8, w0, w1
57-
; CHECK-NEXT: cmp w1, #0
58-
; CHECK-NEXT: sub w8, w8, #1
59-
; CHECK-NEXT: csel w0, wzr, w8, eq
53+
; CHECK-NEXT: subs w8, w1, #1
54+
; CHECK-NEXT: add w8, w8, w0
55+
; CHECK-NEXT: csel w0, wzr, w8, lo
6056
; CHECK-NEXT: ret
6157
%cmp = icmp ule i32 %x1, 0
6258
%add = add i32 %x0, %x1
@@ -68,9 +64,8 @@ define i32 @test_ule0_sub_add_i32(i32 %x0, i32 %x1) {
6864
define i32 @test_ultminus2_sub_add_i32(i32 %x0, i32 %x1) {
6965
; CHECK-LABEL: test_ultminus2_sub_add_i32:
7066
; CHECK: // %bb.0:
71-
; CHECK-NEXT: add w8, w0, w1
72-
; CHECK-NEXT: cmn w1, #2
73-
; CHECK-NEXT: add w8, w8, #2
67+
; CHECK-NEXT: adds w8, w1, #2
68+
; CHECK-NEXT: add w8, w8, w0
7469
; CHECK-NEXT: csel w0, wzr, w8, lo
7570
; CHECK-NEXT: ret
7671
%cmp = icmp ult i32 %x1, -2
@@ -83,10 +78,9 @@ define i32 @test_ultminus2_sub_add_i32(i32 %x0, i32 %x1) {
8378
define i32 @test_ne0_sub_add_i32(i32 %x0, i32 %x1) {
8479
; CHECK-LABEL: test_ne0_sub_add_i32:
8580
; CHECK: // %bb.0:
86-
; CHECK-NEXT: add w8, w0, w1
87-
; CHECK-NEXT: cmp w1, #0
88-
; CHECK-NEXT: sub w8, w8, #1
89-
; CHECK-NEXT: csel w0, w8, wzr, ne
81+
; CHECK-NEXT: subs w8, w1, #1
82+
; CHECK-NEXT: add w8, w8, w0
83+
; CHECK-NEXT: csel w0, w8, wzr, hs
9084
; CHECK-NEXT: ret
9185
%cmp = icmp ne i32 %x1, 0
9286
%add = add i32 %x0, %x1
@@ -98,10 +92,9 @@ define i32 @test_ne0_sub_add_i32(i32 %x0, i32 %x1) {
9892
define i32 @test_ugt7_sub_add_i32(i32 %x0, i32 %x1) {
9993
; CHECK-LABEL: test_ugt7_sub_add_i32:
10094
; CHECK: // %bb.0:
101-
; CHECK-NEXT: add w8, w0, w1
102-
; CHECK-NEXT: cmp w1, #7
103-
; CHECK-NEXT: sub w8, w8, #8
104-
; CHECK-NEXT: csel w0, wzr, w8, hi
95+
; CHECK-NEXT: subs w8, w1, #8
96+
; CHECK-NEXT: add w8, w8, w0
97+
; CHECK-NEXT: csel w0, wzr, w8, hs
10598
; CHECK-NEXT: ret
10699
%cmp = icmp ugt i32 %x1, 7
107100
%add = add i32 %x0, %x1
@@ -113,10 +106,9 @@ define i32 @test_ugt7_sub_add_i32(i32 %x0, i32 %x1) {
113106
define i32 @test_eq0_sub_addcomm_i32(i32 %x0, i32 %x1) {
114107
; CHECK-LABEL: test_eq0_sub_addcomm_i32:
115108
; CHECK: // %bb.0:
116-
; CHECK-NEXT: add w8, w1, w0
117-
; CHECK-NEXT: cmp w1, #0
118-
; CHECK-NEXT: sub w8, w8, #1
119-
; CHECK-NEXT: csel w0, wzr, w8, eq
109+
; CHECK-NEXT: subs w8, w1, #1
110+
; CHECK-NEXT: add w8, w8, w0
111+
; CHECK-NEXT: csel w0, wzr, w8, lo
120112
; CHECK-NEXT: ret
121113
%cmp = icmp eq i32 %x1, 0
122114
%add = add i32 %x1, %x0
@@ -128,10 +120,9 @@ define i32 @test_eq0_sub_addcomm_i32(i32 %x0, i32 %x1) {
128120
define i32 @test_eq0_subcomm_add_i32(i32 %x0, i32 %x1) {
129121
; CHECK-LABEL: test_eq0_subcomm_add_i32:
130122
; CHECK: // %bb.0:
131-
; CHECK-NEXT: add w8, w0, w1
132-
; CHECK-NEXT: cmp w1, #0
133-
; CHECK-NEXT: sub w8, w8, #1
134-
; CHECK-NEXT: csel w0, wzr, w8, eq
123+
; CHECK-NEXT: subs w8, w1, #1
124+
; CHECK-NEXT: add w8, w8, w0
125+
; CHECK-NEXT: csel w0, wzr, w8, lo
135126
; CHECK-NEXT: ret
136127
%cmp = icmp eq i32 %x1, 0
137128
%add = add i32 %x0, %x1
@@ -143,21 +134,16 @@ define i32 @test_eq0_subcomm_add_i32(i32 %x0, i32 %x1) {
143134
define i32 @test_eq0_multi_use_sub_i32(i32 %x0, i32 %x1) {
144135
; CHECK-LABEL: test_eq0_multi_use_sub_i32:
145136
; CHECK: // %bb.0:
146-
; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
147-
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
148-
; CHECK-NEXT: .cfi_def_cfa_offset 32
137+
; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
138+
; CHECK-NEXT: .cfi_def_cfa_offset 16
149139
; CHECK-NEXT: .cfi_offset w19, -8
150-
; CHECK-NEXT: .cfi_offset w20, -16
151-
; CHECK-NEXT: .cfi_offset w30, -32
152-
; CHECK-NEXT: add w8, w0, w1
153-
; CHECK-NEXT: mov w19, w1
154-
; CHECK-NEXT: sub w20, w8, #1
155-
; CHECK-NEXT: mov w0, w20
140+
; CHECK-NEXT: .cfi_offset w30, -16
141+
; CHECK-NEXT: subs w8, w1, #1
142+
; CHECK-NEXT: add w0, w8, w0
143+
; CHECK-NEXT: csel w19, wzr, w0, lo
156144
; CHECK-NEXT: bl use_i32
157-
; CHECK-NEXT: cmp w19, #0
158-
; CHECK-NEXT: csel w0, wzr, w20, eq
159-
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
160-
; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
145+
; CHECK-NEXT: mov w0, w19
146+
; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
161147
; CHECK-NEXT: ret
162148
%cmp = icmp eq i32 %x1, 0
163149
%add = add nuw i32 %x0, %x1

0 commit comments

Comments
 (0)