[AMDGPU][SDAG] Try folding "lshr i64 + mad" to "mad_u64_u32" #119218
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Vikram Hegde (vikramRH)

Changes: This is essentially a revert of #76285. The intention is to use a "copy" instead of a "sub" to handle the high parts of the 64-bit multiply for this specific case (ref: https://alive2.llvm.org/ce/z/H0iuty and https://godbolt.org/z/ba4sja8jh). This unlocks copy-propagation use cases where the copy can be reused by later multiply+add sequences where possible.

Fixes: SWDEV-487672

Full diff: https://github.com/llvm/llvm-project/pull/119218.diff (2 files affected)
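A quick sanity check on the identity behind this fold (my own standalone sketch, not part of the patch): when the constant's high 32 bits are all ones, the 64-bit constant equals Const.lo - 2^32 modulo 2^64, and x = x.lo + 2^32 * (x >> 32), so the two 2^32 terms cancel and (x >> 32) * Const + x reduces to (x >> 32) * Const.lo + x.lo, which is exactly one mad_u64_u32. The helper below just models the instruction's semantics:

#include <cassert>
#include <cstdint>

// Models the 64-bit result of v_mad_u64_u32: full 32x32->64 multiply plus a
// 64-bit addend (carry-out ignored, as in the fold).
static uint64_t mad_u64_u32(uint32_t A, uint32_t B, uint64_t C) {
  return uint64_t(A) * uint64_t(B) + C;
}

int main() {
  // -4294967087 from the new test: high half is -1, low half is 0xd1.
  const uint64_t Const = 0xFFFFFFFF000000D1ULL;
  for (uint64_t X : {0x0ULL, 0x1ULL, 0x00000001FFFFFFFFULL,
                     0xDEADBEEFCAFEF00DULL}) {
    uint64_t Orig = (X >> 32) * Const + X; // lshr + mul + add, mod 2^64
    uint64_t Folded =
        mad_u64_u32(uint32_t(X >> 32), uint32_t(Const), uint32_t(X));
    assert(Orig == Folded);
  }
  return 0;
}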
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f89fe8faa600ba..c18db9f65dc08e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13857,6 +13857,52 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
+// Fold
+// y = lshr i64 x, 32
+// res = add (mul i64 y, Constant), x where "Constant" is a 32-bit
+// negative value
+// To
+// res = mad_u64_u32 y.lo, Constant.lo, x.lo
+static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue MulLHS, SDValue MulRHS,
+ SDValue AddRHS) {
+
+ if (MulLHS.getValueType() != MVT::i64)
+ return SDValue();
+
+ ConstantSDNode *ConstOp;
+ SDValue ShiftOp;
+ if (MulLHS.getOpcode() == ISD::SRL && MulRHS.getOpcode() == ISD::Constant) {
+ ConstOp = cast<ConstantSDNode>(MulRHS.getNode());
+ ShiftOp = MulLHS;
+ } else if (MulRHS.getOpcode() == ISD::SRL &&
+ MulLHS.getOpcode() == ISD::Constant) {
+ ConstOp = cast<ConstantSDNode>(MulLHS.getNode());
+ ShiftOp = MulRHS;
+ } else
+ return SDValue();
+
+ if (ShiftOp.getOperand(1).getOpcode() != ISD::Constant ||
+ AddRHS != ShiftOp.getOperand(0))
+ return SDValue();
+
+ if (cast<ConstantSDNode>(ShiftOp->getOperand(1))->getAsZExtVal() != 32)
+ return SDValue();
+
+ APInt ConstVal = ConstOp->getAPIntValue();
+ if (!ConstVal.isNegative() || !ConstVal.isSignedIntN(33))
+ return SDValue();
+
+ SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ SDValue ConstMul = DAG.getConstant(
+ ConstVal.getZExtValue() & 0x00000000FFFFFFFF, SL, MVT::i32);
+ AddRHS = DAG.getNode(ISD::AND, SL, MVT::i64, AddRHS,
+ DAG.getConstant(0x00000000FFFFFFFF, SL, MVT::i64));
+ return getMad64_32(DAG, SL, MVT::i64,
+ DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
+ AddRHS, false);
+}
+
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
@@ -13915,6 +13961,9 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
SDValue MulRHS = LHS.getOperand(1);
SDValue AddRHS = RHS;
+ if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
+ return FoldedMAD;
+
// Always check whether operands are small unsigned values, since that
// knowledge is useful in more cases. Check for small signed values only if
// doing so can unlock a shorter code sequence.
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 33007e5b285d80..d8f6eb266fc6ca 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1333,5 +1333,90 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
ret i48 %a
}
+define i64 @lshr_mad_i64(ptr addrspace(1) %1) local_unnamed_addr #0 {
+; CI-LABEL: lshr_mad_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s4, s6
+; CI-NEXT: s_mov_b32 s5, s6
+; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; CI-NEXT: v_mov_b32_e32 v3, 0
+; CI-NEXT: s_movk_i32 s4, 0xd1
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, s4, v[2:3]
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_movk_i32 s4, 0xd1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v3, v1, s4
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_movk_i32 s4, 0xd1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, s4, v[2:3]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX1100-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xd1, v2, v[3:4]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX1150-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, v0
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xd1, v1, v[2:3]
+; GFX1150-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xd1, v1, v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %3 = load i64, ptr addrspace(1) %1, align 8
+ %4 = lshr i64 %3, 32
+ %5 = mul nsw i64 %4, -4294967087
+ %6 = add nsw i64 %5, %3
+
+ ret i64 %6
+}
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
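A note on the expected assembly above (my own arithmetic, not from the PR): -4294967087 is 0xFFFFFFFF000000D1 in two's complement, so the constant's high half is -1 and its low half is 0xd1. That is why the CI, GFX9, and GFX11/GFX12 checks materialize only the 32-bit immediate 0xd1 feeding a single v_mad_u64_u32 (v_mad_co_u64_u32 on GFX12), while SI, which lacks that instruction, keeps the v_mul_hi_u32/v_mul_lo_u32/v_sub expansion.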
I wouldn't go that far. It fixes a code quality regression caused by that patch in the specific case of the divisor
Right, updated the summary. Sorry for the confusion.
@@ -1333,5 +1333,90 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
  ret i48 %a
}

define i64 @lshr_mad_i64(ptr addrspace(1) %1) local_unnamed_addr #0 {
Please pre-commit the tests so we can see what effect your patch has on the codegen.
// with Const.hi == -1
// To
// res = mad_u64_u32 y.lo, Const.lo, x.lo
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
Can you also implement the same in globalisel? Not sure if the original combine is there, or if that needs porting as well
Sure, it seems the original combine needs porting.
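For context, a rough sketch of what the matching side of a GlobalISel port might look like using MIPatternMatch; the helper name and structure here are my assumptions, not the eventual port:

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
using namespace llvm::MIPatternMatch;

// Hypothetical matcher: does Reg come from (G_LSHR Src, 32)?
static bool matchLShrBy32(Register Reg, const MachineRegisterInfo &MRI,
                          Register &Src) {
  return mi_match(Reg, MRI, m_GLShr(m_Reg(Src), m_SpecificICst(32)));
}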
Just "mad_u64_u32". This optimization does not use the signed version of the instruction.
if (ShiftVal->getAsZExtVal() != 32)
  return SDValue();

APInt Const = dyn_cast<ConstantSDNode>(MulRHS.getNode())->getAPIntValue();
Need to check that MulRHS is a constant before calling getAPIntValue.
The previous checks (including those at the call site of this function) make sure MulRHS is always a ConstantSDNode, so I guess checking should not be necessary.
You shouldn't be splitting the logic this way, and you should never have an unchecked dyn_cast.
Fixed.
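For readers following the thread, a minimal sketch of the checked-cast pattern being asked for (the helper is hypothetical, not the patch's final code): dyn_cast returns null on a type mismatch and must be checked, while cast asserts and should only be used where the type is already guaranteed.

#include "llvm/CodeGen/SelectionDAGNodes.h"
#include <optional>
using namespace llvm;

// Hypothetical helper: extract the APInt only if V really is a constant.
static std::optional<APInt> getConstVal(SDValue V) {
  if (auto *C = dyn_cast<ConstantSDNode>(V)) // null if V is not a ConstantSDNode
    return C->getAPIntValue();
  return std::nullopt;
}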
if (isa<ConstantSDNode>(MulLHS) || isa<ConstantSDNode>(MulRHS)) {
  if (MulRHS.getOpcode() == ISD::SRL)
    std::swap(MulLHS, MulRHS);
Sink all the logic into the implementation.
Done.
Ping.
LGTM with nit
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/11870. Here is the relevant piece of the build log for reference:
The intention is to use a "copy" instead of a "sub" to handle the high parts of the 64-bit multiply for this specific case (ref: https://alive2.llvm.org/ce/z/H0iuty and https://godbolt.org/z/ba4sja8jh).
This unlocks copy-propagation use cases where the copy can be reused by later multiply+add sequences where possible.
Fixes: SWDEV-487672, SWDEV-487669