Skip to content

[AMDGPU][SDAG] Try folding "lshr i64 + mad" to "mad_u64_u32" #119218

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 17, 2025

Conversation

vikramRH
Copy link
Contributor

@vikramRH vikramRH commented Dec 9, 2024

The intention is to use a "copy" instead of a "sub" to handle the high parts of 64-bit multiply for this specific case. (ref https://alive2.llvm.org/ce/z/H0iuty and https://godbolt.org/z/ba4sja8jh).

This unlocks copy prop use cases where the copy can be reused by later multiply+add sequences if possible.

Fixes: SWDEV-487672, SWDEV-487669

@llvmbot
Copy link
Member

llvmbot commented Dec 9, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Vikram Hegde (vikramRH)

Changes

This is essentially a revert of #76285. The intention is to use a "copy" instead of a "sub" to handle the high parts of 64-bit multiply for this specific case. (ref https://alive2.llvm.org/ce/z/H0iuty and https://godbolt.org/z/ba4sja8jh).

This unlocks copy prop use cases where the copy can be reused by later multiply+add sequences if possible.

Fixes: SWDEV-487672


Full diff: https://github.com/llvm/llvm-project/pull/119218.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+49)
  • (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+85)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f89fe8faa600ba..c18db9f65dc08e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13857,6 +13857,52 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
 }
 
+// Fold
+//     y = lshr i64 x, 32
+//     res = add (mul i64 y, Constant), x   where "Constant" is a 32 bit
+//     negative value
+// To
+//     res = mad_u64_u32 y.lo ,Constant.lo, x.lo
+static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
+                                 SDValue MulLHS, SDValue MulRHS,
+                                 SDValue AddRHS) {
+
+  if (MulLHS.getValueType() != MVT::i64)
+    return SDValue();
+
+  ConstantSDNode *ConstOp;
+  SDValue ShiftOp;
+  if (MulLHS.getOpcode() == ISD::SRL && MulRHS.getOpcode() == ISD::Constant) {
+    ConstOp = cast<ConstantSDNode>(MulRHS.getNode());
+    ShiftOp = MulLHS;
+  } else if (MulRHS.getOpcode() == ISD::SRL &&
+             MulLHS.getOpcode() == ISD::Constant) {
+    ConstOp = cast<ConstantSDNode>(MulLHS.getNode());
+    ShiftOp = MulRHS;
+  } else
+    return SDValue();
+
+  if (ShiftOp.getOperand(1).getOpcode() != ISD::Constant ||
+      AddRHS != ShiftOp.getOperand(0))
+    return SDValue();
+
+  if (cast<ConstantSDNode>(ShiftOp->getOperand(1))->getAsZExtVal() != 32)
+    return SDValue();
+
+  APInt ConstVal = ConstOp->getAPIntValue();
+  if (!ConstVal.isNegative() || !ConstVal.isSignedIntN(33))
+    return SDValue();
+
+  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  SDValue ConstMul = DAG.getConstant(
+      ConstVal.getZExtValue() & 0x00000000FFFFFFFF, SL, MVT::i32);
+  AddRHS = DAG.getNode(ISD::AND, SL, MVT::i64, AddRHS,
+                       DAG.getConstant(0x00000000FFFFFFFF, SL, MVT::i64));
+  return getMad64_32(DAG, SL, MVT::i64,
+                     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
+                     AddRHS, false);
+}
+
 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
 // multiplies, if any.
 //
@@ -13915,6 +13961,9 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   SDValue MulRHS = LHS.getOperand(1);
   SDValue AddRHS = RHS;
 
+  if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
+    return FoldedMAD;
+
   // Always check whether operands are small unsigned values, since that
   // knowledge is useful in more cases. Check for small signed values only if
   // doing so can unlock a shorter code sequence.
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 33007e5b285d80..d8f6eb266fc6ca 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1333,5 +1333,90 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
   ret i48 %a
 }
 
+define i64 @lshr_mad_i64(ptr addrspace(1) %1) local_unnamed_addr #0 {
+; CI-LABEL: lshr_mad_i64:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s4, s6
+; CI-NEXT:    s_mov_b32 s5, s6
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    s_movk_i32 s4, 0xd1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v2, v0
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, s4, v[2:3]
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_movk_i32 s4, 0xd1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_movk_i32 s4, 0xd1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, s4, v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX1100-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, 0xd1, v2, v[3:4]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xd1, v1, v[2:3]
+; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xd1, v1, v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %3 = load i64, ptr addrspace(1) %1, align 8
+  %4 = lshr i64 %3, 32
+  %5 = mul nsw i64 %4, -4294967087
+  %6 = add nsw i64 %5, %3
+
+  ret i64 %6
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }

@jayfoad
Copy link
Contributor

jayfoad commented Dec 9, 2024

This is essentially a revert of #76285.

I wouldn't go that far. It fixes a code quality regression caused by that patch in the specific case of the divisor $C0$ being $2^{32}$.

@vikramRH
Copy link
Contributor Author

vikramRH commented Dec 9, 2024

This is essentially a revert of #76285.

I wouldn't go that far. It fixes a code quality regression caused by that patch in the specific case of the divisor C 0 being 2 32 .

right, updated the summary. sorry for the confusion

@@ -1333,5 +1333,90 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
ret i48 %a
}

define i64 @lshr_mad_i64(ptr addrspace(1) %1) local_unnamed_addr #0 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please pre-commit the tests so we can see what effect your patch has on the codegen.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// with Const.hi == -1
// To
// res = mad_u64_u32 y.lo ,Const.lo, x.lo
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also implement the same in globalisel? Not sure if the original combine is there, or if that needs porting as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, it seems original combine needs porting

@jayfoad
Copy link
Contributor

jayfoad commented Jan 2, 2025

Try folding "lshr i64 + mad" to "mad_[iu]64_[iu]32"

Just "mad_u64_u32". This optimization does not use the signed version of the instruction.

if (ShiftVal->getAsZExtVal() != 32)
return SDValue();

APInt Const = dyn_cast<ConstantSDNode>(MulRHS.getNode())->getAPIntValue();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to check that MulRHS is a constant before calling getAPIntValue.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous checks (including those at the callsite of this function) make sure the MulRHS is always a ConstantSDNode. so I guess checking should not be necessary

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You shouldn't be splitting the logic this way, and you should never have an unchecked dyn_cast.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

@vikramRH vikramRH changed the title [AMDGPU][SDAG] Try folding "lshr i64 + mad" to "mad_[iu]64_[iu]32" [AMDGPU][SDAG] Try folding "lshr i64 + mad" to "mad_u64_u32" Jan 6, 2025
Comment on lines 13948 to 13950
if (isa<ConstantSDNode>(MulLHS) || isa<ConstantSDNode>(MulRHS)) {
if (MulRHS.getOpcode() == ISD::SRL)
std::swap(MulLHS, MulRHS);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sink all the logic into the implementation

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

@vikramRH
Copy link
Contributor Author

vikramRH commented Jan 9, 2025

Ping..

Copy link
Contributor

@jayfoad jayfoad left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM with nit

@vikramRH vikramRH merged commit 225fc4f into llvm:main Jan 17, 2025
8 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Jan 17, 2025

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/11870

Here is the relevant piece of the build log for the reference
Step 7 (Add check check-offload) failure: test (failure)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: offloading/pgo1.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp    -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a -fprofile-instr-generate      -Xclang "-fprofile-instrument=clang"
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a -fprofile-instr-generate -Xclang -fprofile-instrument=clang
# RUN: at line 3
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c      --check-prefix="CLANG-PGO"
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c --check-prefix=CLANG-PGO
# .---command stderr------------
# | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c:32:20: error: CLANG-PGO-NEXT: expected string not found in input
# | // CLANG-PGO-NEXT: [ 0 11 20 ]
# |                    ^
# | <stdin>:3:28: note: scanning from here
# | ======== Counters =========
# |                            ^
# | <stdin>:4:1: note: possible intended match here
# | [ 0 14 20 ]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |            1: ======= GPU Profile ======= 
# |            2: Target: amdgcn-amd-amdhsa 
# |            3: ======== Counters ========= 
# | next:32'0                                X error: no match found
# |            4: [ 0 14 20 ] 
# | next:32'0     ~~~~~~~~~~~~
# | next:32'1     ?            possible intended match
# |            5: [ 10 ] 
# | next:32'0     ~~~~~~~
# |            6: [ 20 ] 
# | next:32'0     ~~~~~~~
# |            7: ========== Data =========== 
# | next:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            8: { 9515997471539760012 4749112401 0xffffffffffffffd8 0x0 0x0 0x0 3 [...] 0 } 
# | next:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            9: { 3666282617048535130 24 0xffffffffffffffb0 0x0 0x0 0x0 1 [...] 0 } 
# | next:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            .
# |            .
# |            .
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants