-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[GlobalISel] Combine [S,U]SUBO #116489
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GlobalISel] Combine [S,U]SUBO #116489
Conversation
We import the llvm.ssub.with.overflow.* Intrinsics, but the Legalizer also builds them while legalizing other opcodes, see narrowScalarAddSub.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Thorsten Schütt (tschuett) Changes: We import the llvm.ssub.with.overflow.* Intrinsics, but the Legalizer also builds them while legalizing other opcodes, see narrowScalarAddSub. Patch is 90.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116489.diff 10 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b1232a368a3657..55c3b72c8e027f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -939,6 +939,9 @@ class CombinerHelper {
// merge_values(_, zero) -> zext
bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
+ // overflow sub
+ bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 4de14dee190fb3..9e5d4d34f24d2b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -486,6 +486,23 @@ class GAddCarryOut : public GBinOpCarryOut {
}
};
+/// Represents overflowing sub operations.
+/// G_USUBO, G_SSUBO
+class GSubCarryOut : public GBinOpCarryOut {
+public:
+ bool isSigned() const { return getOpcode() == TargetOpcode::G_SSUBO; }
+
+ static bool classof(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_SSUBO:
+ return true;
+ default:
+ return false;
+ }
+ }
+};
+
/// Represents overflowing add/sub operations that also consume a carry-in.
/// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index f8379609bf1d98..b4f1551e965b14 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1385,6 +1385,12 @@ def match_addos : GICombineRule<
[{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def match_subos : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_SSUBO, G_USUBO):$root,
+ [{ return Helper.matchSuboCarryOut(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
def match_extract_of_element_undef_vector: GICombineRule <
(defs root:$root),
(match (G_IMPLICIT_DEF $vector),
@@ -1901,6 +1907,13 @@ def cmp_combines: GICombineGroup<[
redundant_binop_in_equality
]>;
+
+def artifact_combines: GICombineGroup<[
+ merge_combines,
+ match_addos,
+ match_subos
+]>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -1984,9 +1997,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
sub_add_reg, select_to_minmax,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
- combine_concat_vector, match_addos,
+ combine_concat_vector,
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
- combine_use_vector_truncate, merge_combines]>;
+ combine_use_vector_truncate, artifact_combines]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 797a1e84e21e35..85c56ee6863a95 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -84,3 +84,75 @@ bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
};
return true;
}
+
+bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GSubCarryOut *Subo = cast<GSubCarryOut>(&MI);
+
+ Register Dst = Subo->getReg(0);
+ Register LHS = Subo->getLHSReg();
+ Register RHS = Subo->getRHSReg();
+ Register Carry = Subo->getCarryOutReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT CarryTy = MRI.getType(Carry);
+
+ // Check legality before known bits.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) ||
+ !isConstantLegalOrBeforeLegalizer(CarryTy))
+ return false;
+
+ if (Subo->isSigned()) {
+ // G_SSUBO
+ ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+ /* IsSigned= */ true);
+ ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+ /* IsSigned= */ true);
+ switch (KBLHS.signedSubMayOverflow(KBRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // G_USUBO
+ ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+ /* IsSigned= */ false);
+ ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+ /* IsSigned= */ false);
+ switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 8af8cdfeba6ac4..23563bf9b7881f 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -320,7 +320,7 @@ def AArch64PostLegalizerCombiner
hoist_logic_op_with_same_opcode_hands,
redundant_and, xor_of_and_with_same_reg,
extractvecelt_pairwise_add, redundant_or,
- mul_const, redundant_sext_inreg,
+ mul_const, redundant_sext_inreg, artifact_combines,
form_bitfield_extract, rotate_out_of_range,
icmp_to_true_false_known_bits,
select_combines, fold_merge_to_zext,
@@ -328,7 +328,7 @@ def AArch64PostLegalizerCombiner
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
- commute_constant_to_rhs, merge_combines,
+ commute_constant_to_rhs,
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt, combine_use_vector_truncate]> {
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index bc4b5ae7c066a6..87b30f558539c8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -176,3 +176,49 @@ body: |
$q1 = COPY %o_wide
RET_ReallyLR implicit $w0
...
+---
+name: sub_may
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: sub_may
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+ ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_SSUBO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %const:_(s32) = G_CONSTANT i32 512
+ %sub:_(s32), %o:_(s1) = G_SSUBO %0, %const
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %sub(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: usub_may
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: usub_may
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+ ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_USUBO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %const:_(s32) = G_CONSTANT i32 512
+ %sub:_(s32), %o:_(s1) = G_USUBO %0, %const
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %sub(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 1fc4de1c48b7dd..f9f1cd4b1fcf76 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -113,9 +113,9 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
;
; GISEL-LABEL: popcount256:
; GISEL: // %bb.0: // %Entry
-; GISEL-NEXT: ldp x8, x9, [x0, #16]
+; GISEL-NEXT: ldp x8, x9, [x0]
; GISEL-NEXT: mov v0.d[0], x8
-; GISEL-NEXT: ldp x8, x10, [x0]
+; GISEL-NEXT: ldp x8, x10, [x0, #16]
; GISEL-NEXT: mov v1.d[0], x8
; GISEL-NEXT: mov v0.d[1], x9
; GISEL-NEXT: mov v1.d[1], x10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 613c73f7b9368b..14b30e0d79946c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1178,212 +1178,212 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT: s_subb_u32 s7, 0, 0
+; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v6, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT: v_mov_b32_e32 v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT: v_trunc_f32_e32 v8, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_mov_b32_e32 v8, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT: v_mov_b32_e32 v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT: v_mov_b32_e32 v4, v14
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v12, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v17, v18, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8,...
[truncated]
|
|
||
def artifact_combines: GICombineGroup<[ | ||
merge_combines, | ||
match_addos, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These aren't artifacts.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, the legalizer builds addos while narrowing, e.g., adds:
OpO = TargetOpcode::G_UADDO; |
Maybe artifacts is too strict. I stated in the summary that we import addos and the legalizer builds them while legalizing non addos.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That doesn't make them artifacts. There are lots of instructions that the legalizer builds like loads/stores but those aren't artifacts either.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I need a mechanism to combine addos in AArch64PostLegalizerCombiner and I want to have a group where I/we can add more combines that run in the post legalizer combiner. Is it sufficient to rename artifact_combines
to something like post_legalizer_combines
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't name these as post-legalizer combines in this file. Each target defines their own pipeline. Name the groups based on their category and then refer to them in any custom groups needed within AArch64Combine.td
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I split them again into two topic groups.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One last thing, also not sure if AMDGPU tests are better or not, wait for Matt to finish looking at it.
@@ -1385,6 +1385,12 @@ def match_addos : GICombineRule< | |||
[{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]), | |||
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; | |||
|
|||
def match_subo_no_overflow : GICombineRule< | |||
(defs root:$root, build_fn_matchinfo:$matchinfo), | |||
(match (wip_match_opcode G_SSUBO, G_USUBO):$root, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you just use match
instead of wip_match_opcode
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Firstly, it was copy-pasted from match_addos
above. Secondly, I believe we can match only one tree/graph per match. I can split it into two combines if you want.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't like it either. There is no prior art for matches with two output operands.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not a big deal, we can try to convert them en masse in another change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My real/only issue with wip_match_opcode
is when the docstring says, e.g. G_FNEG(G_BUILD_VECTOR), and the user is doing wip_match_opcode
with G_FNEG. It loses precision.
I have another request in general for your changes. Could you please add some more appropriate descriptions to your PRs so that when you merge them, we end up with a better commit message? For example, for this commit, we could've done with a description of what the actual combine did. Instead, we got some commentary on some implementation details that aren't very helpful to the reader of the commit. |
We import the llvm.ssub.with.overflow.* Intrinsics, but the Legalizer also builds them while legalizing other opcodes, see narrowScalarAddSub.