-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[GlobalISel] Combine [S,U]SUBO #116489
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GlobalISel] Combine [S,U]SUBO #116489
Conversation
We import the llvm.ssub.with.overflow.* Intrinsics, but the Legalizer also builds them while legalizing other opcodes, see narrowScalarAddSub.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Thorsten Schütt (tschuett) Changes: We import the llvm.ssub.with.overflow.* Intrinsics, but the Legalizer also builds them while legalizing other opcodes, see narrowScalarAddSub. Patch is 90.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116489.diff 10 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b1232a368a3657..55c3b72c8e027f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -939,6 +939,9 @@ class CombinerHelper {
// merge_values(_, zero) -> zext
bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
+ // overflow sub
+ bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 4de14dee190fb3..9e5d4d34f24d2b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -486,6 +486,23 @@ class GAddCarryOut : public GBinOpCarryOut {
}
};
+/// Represents overflowing sub operations.
+/// G_USUBO, G_SSUBO
+class GSubCarryOut : public GBinOpCarryOut {
+public:
+ bool isSigned() const { return getOpcode() == TargetOpcode::G_SSUBO; }
+
+ static bool classof(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_SSUBO:
+ return true;
+ default:
+ return false;
+ }
+ }
+};
+
/// Represents overflowing add/sub operations that also consume a carry-in.
/// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index f8379609bf1d98..b4f1551e965b14 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1385,6 +1385,12 @@ def match_addos : GICombineRule<
[{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def match_subos : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_SSUBO, G_USUBO):$root,
+ [{ return Helper.matchSuboCarryOut(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
def match_extract_of_element_undef_vector: GICombineRule <
(defs root:$root),
(match (G_IMPLICIT_DEF $vector),
@@ -1901,6 +1907,13 @@ def cmp_combines: GICombineGroup<[
redundant_binop_in_equality
]>;
+
+def artifact_combines: GICombineGroup<[
+ merge_combines,
+ match_addos,
+ match_subos
+]>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -1984,9 +1997,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
sub_add_reg, select_to_minmax,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
- combine_concat_vector, match_addos,
+ combine_concat_vector,
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
- combine_use_vector_truncate, merge_combines]>;
+ combine_use_vector_truncate, artifact_combines]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 797a1e84e21e35..85c56ee6863a95 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -84,3 +84,75 @@ bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
};
return true;
}
+
+bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GSubCarryOut *Subo = cast<GSubCarryOut>(&MI);
+
+ Register Dst = Subo->getReg(0);
+ Register LHS = Subo->getLHSReg();
+ Register RHS = Subo->getRHSReg();
+ Register Carry = Subo->getCarryOutReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT CarryTy = MRI.getType(Carry);
+
+ // Check legality before known bits.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) ||
+ !isConstantLegalOrBeforeLegalizer(CarryTy))
+ return false;
+
+ if (Subo->isSigned()) {
+ // G_SSUBO
+ ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+ /* IsSigned= */ true);
+ ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+ /* IsSigned= */ true);
+ switch (KBLHS.signedSubMayOverflow(KBRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // G_USUBO
+ ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+ /* IsSigned= */ false);
+ ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+ /* IsSigned= */ false);
+ switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 8af8cdfeba6ac4..23563bf9b7881f 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -320,7 +320,7 @@ def AArch64PostLegalizerCombiner
hoist_logic_op_with_same_opcode_hands,
redundant_and, xor_of_and_with_same_reg,
extractvecelt_pairwise_add, redundant_or,
- mul_const, redundant_sext_inreg,
+ mul_const, redundant_sext_inreg, artifact_combines,
form_bitfield_extract, rotate_out_of_range,
icmp_to_true_false_known_bits,
select_combines, fold_merge_to_zext,
@@ -328,7 +328,7 @@ def AArch64PostLegalizerCombiner
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
- commute_constant_to_rhs, merge_combines,
+ commute_constant_to_rhs,
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt, combine_use_vector_truncate]> {
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index bc4b5ae7c066a6..87b30f558539c8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -176,3 +176,49 @@ body: |
$q1 = COPY %o_wide
RET_ReallyLR implicit $w0
...
+---
+name: sub_may
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: sub_may
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+ ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_SSUBO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %const:_(s32) = G_CONSTANT i32 512
+ %sub:_(s32), %o:_(s1) = G_SSUBO %0, %const
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %sub(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: usub_may
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: usub_may
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+ ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_USUBO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %const:_(s32) = G_CONSTANT i32 512
+ %sub:_(s32), %o:_(s1) = G_USUBO %0, %const
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %sub(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 1fc4de1c48b7dd..f9f1cd4b1fcf76 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -113,9 +113,9 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
;
; GISEL-LABEL: popcount256:
; GISEL: // %bb.0: // %Entry
-; GISEL-NEXT: ldp x8, x9, [x0, #16]
+; GISEL-NEXT: ldp x8, x9, [x0]
; GISEL-NEXT: mov v0.d[0], x8
-; GISEL-NEXT: ldp x8, x10, [x0]
+; GISEL-NEXT: ldp x8, x10, [x0, #16]
; GISEL-NEXT: mov v1.d[0], x8
; GISEL-NEXT: mov v0.d[1], x9
; GISEL-NEXT: mov v1.d[1], x10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 613c73f7b9368b..14b30e0d79946c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1178,212 +1178,212 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT: s_subb_u32 s7, 0, 0
+; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v6, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT: v_mov_b32_e32 v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT: v_trunc_f32_e32 v8, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_mov_b32_e32 v8, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT: v_mov_b32_e32 v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT: v_mov_b32_e32 v4, v14
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v12, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v17, v18, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8,...
[truncated]
|
|
||
def artifact_combines: GICombineGroup<[ | ||
merge_combines, | ||
match_addos, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These aren't artifacts.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, the legalizer builds addos while narrowing, e.g., adds:
OpO = TargetOpcode::G_UADDO; |
Maybe artifacts is too strict. I stated in the summary that we import addos and the legalizer builds them while legalizing non addos.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That doesn't make them artifacts. There are lots of instructions that the legalizer builds like loads/stores but those aren't artifacts either.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I need a mechanism to combine addos in AArch64PostLegalizerCombiner and I want to have a group where I/we can add more combines that run in the post legalizer combiner. Is it sufficient to rename artifact_combines
to something like post_legalizer_combines
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't name these as post-legalizer combines in this file. Each target defines their own pipeline. Name the groups based on their category and then refer to them in any custom groups needed within AArch64Combine.td
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I split them again into two topic groups.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One last thing, also not sure if AMDGPU tests are better or not, wait for Matt to finish looking at it.
@@ -1385,6 +1385,12 @@ def match_addos : GICombineRule< | |||
[{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]), | |||
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; | |||
|
|||
def match_subo_no_overflow : GICombineRule< | |||
(defs root:$root, build_fn_matchinfo:$matchinfo), | |||
(match (wip_match_opcode G_SSUBO, G_USUBO):$root, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you just use match
instead of wip_match_opcode
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Firstly, it was copy-pasted from match_addos
above. Secondly, I believe we can match only one tree/graph per match. I can split it into two combines if you want.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't like it either. There is no prior art for matches with two output operands.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not a big deal, we can try to convert them en masse in another change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My real/only issue with wip_match_opcode
is when the docstring says, e.g. G_FNEG(G_BUILD_VECTOR), and the user is doing wip_match_opcode
with G_FNEG. It loses precision.
I have another request in general for your changes. Could you please add some more appropriate descriptions to your PRs so that when you merge them, we end up with a better commit message? For example, for this commit, we could've done with a description of what the actual combine did. Instead, we got some commentary on some implementation details that aren't very helpful to the reader of the commit. |
We import the llvm.ssub.with.overflow.* Intrinsics, but the Legalizer also builds them while legalizing other opcodes, see narrowScalarAddSub.