[AMDGPU][SDAG] Add target-specific ISD::PTRADD combines #143673

Open: wants to merge 1 commit into base main.
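
For orientation: ISD::PTRADD is the SelectionDAG node that represents pointer addition explicitly, rather than as a plain ISD::ADD on the pointer's integer value. Under the experimental PTRADD-based lowering (the GFX942_PTRADD check prefix in the tests below), a getelementptr like the following minimal sketch (hypothetical, not taken from this PR's tests) is selected through a PTRADD node, which is what the combines in this patch operate on.

define ptr addrspace(1) @index_gep(ptr addrspace(1) %p, i64 %i) {
  ; In this mode, the base pointer plus the byte offset (%i scaled by 4)
  ; becomes an ISD::PTRADD rather than an ISD::ADD on the pointer value.
  %gep = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %i
  ret ptr addrspace(1) %gep
}
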
4 changes: 3 additions & 1 deletion llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6710,7 +6710,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
return SDValue();
int64_t Offset = C2->getSExtValue();
switch (Opcode) {
case ISD::ADD: break;
case ISD::ADD:
case ISD::PTRADD:
break;
case ISD::SUB: Offset = -uint64_t(Offset); break;
default: return SDValue();
}
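
A minimal IR sketch (hypothetical, not taken from this PR's tests) of what the FoldSymbolOffset change enables: with PTRADD-based lowering, a global plus a constant byte offset reaches this helper as (ptradd GlobalAddress, c) rather than (add GlobalAddress, c), so accepting ISD::PTRADD lets the constant be folded into the symbol's offset where the target allows it. The complextype_global_gep test below shows this kind of folding in its @rel32 relocations.

@g = external addrspace(1) global [64 x i32]

define ptr addrspace(1) @global_plus_const() {
  ; Assuming offset folding is legal for this global, the constant 16 can be
  ; absorbed into the relocation instead of requiring a separate add.
  %gep = getelementptr inbounds i8, ptr addrspace(1) @g, i64 16
  ret ptr addrspace(1) %gep
}
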
139 changes: 109 additions & 30 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
#include <optional>

using namespace llvm;
using namespace llvm::SDPatternMatch;

#define DEBUG_TYPE "si-lower"

@@ -14469,7 +14471,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::ADD);
assert(N->isAnyAdd());

SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -14502,7 +14504,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
for (SDNode *User : LHS->users()) {
// There is a use that does not feed into addition, so the multiply can't
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
if (User->getOpcode() != ISD::ADD)
if (!User->isAnyAdd())
return SDValue();

// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14614,8 +14616,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,

SDValue Hi = getHiHalf64(LHS, DAG);
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::PTRADD)
Opcode = ISD::ADD;
SDValue AddHi =
DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());

SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
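
The opcode adjustment above keeps foldAddSub64WithZeroLowBitsTo32 applicable when the incoming node is an ISD::PTRADD: the operation rebuilt for the high 32 bits is plain integer arithmetic, so it has to be emitted as ISD::ADD. A minimal sketch of an input that reaches it (essentially the huge_offset_low_32_unused test further down; the function name here is illustrative):

define ptr @offset_high_half_only(ptr %p) {
  ; The low 32 bits of the offset are zero, so nothing could be folded into a
  ; memory instruction's immediate anyway and the low-half add can be dropped.
  %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
  ret ptr %gep
}
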
@@ -15089,42 +15094,116 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);

if (N1.getOpcode() == ISD::ADD) {
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
// y is not, and (add y, z) is used only once.
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
// z is not, and (add y, z) is used only once.
// The goal is to move constant offsets to the outermost ptradd, to create
// more opportunities to fold offsets into memory instructions.
// Together with the generic combines in DAGCombiner.cpp, this also
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
//
// This transform is here instead of in the general DAGCombiner as it can
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
// AArch64's CPA.
SDValue X = N0;
SDValue Y = N1.getOperand(0);
SDValue Z = N1.getOperand(1);
if (N1.hasOneUse()) {
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
if (ZIsConstant != YIsConstant) {
// If both additions in the original were NUW, the new ones are as well.
SDNodeFlags Flags =
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
if (YIsConstant)
std::swap(Y, Z);
// The following folds transform PTRADDs into regular arithmetic in cases
// where the PTRADD wouldn't be folded as an immediate offset into memory
// instructions anyway. They are target-specific in that other targets might
// prefer to not lose information about the pointer arithmetic.

// Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
// Adapted from DAGCombiner::visitADDLikeCommutative.
SDValue V, K;
if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
DCI.AddToWorklist(Inner.getNode());
return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
}

// Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
// performAddCombine.
if (N1.getOpcode() == ISD::MUL) {
if (Subtarget->hasMad64_32()) {
if (SDValue Folded = tryFoldToMad64_32(N, DCI))
return Folded;
}
}

SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
// If the 32 low bits of the constant are all zero, there is nothing to fold
// into an immediate offset, so it's better to eliminate the unnecessary
// addition for the lower 32 bits than to preserve the PTRADD.
// Analogous to a fold in performAddCombine.
if (VT == MVT::i64) {
if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
return Folded;
}

if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
// global address GA and constant c, such that c can be folded into GA.
SDValue GAValue = N0.getOperand(0);
if (const GlobalAddressSDNode *GA =
dyn_cast<GlobalAddressSDNode>(GAValue)) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
// If both additions in the original were NUW, reassociation preserves
// that.
SDNodeFlags Flags =
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
DCI.AddToWorklist(Inner.getNode());
return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
}
}
}

if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
return SDValue();

// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
// y is not, and (add y, z) is used only once.
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
// z is not, and (add y, z) is used only once.
// The goal is to move constant offsets to the outermost ptradd, to create
// more opportunities to fold offsets into memory instructions.
// Together with the generic combines in DAGCombiner.cpp, this also
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
//
// This transform is here instead of in the general DAGCombiner as it can
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
// AArch64's CPA.
SDValue X = N0;
SDValue Y = N1.getOperand(0);
SDValue Z = N1.getOperand(1);
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

// If both additions in the original were NUW, reassociation preserves that.
SDNodeFlags ReassocFlags =
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;

if (ZIsConstant != YIsConstant) {
if (YIsConstant)
std::swap(Y, Z);
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
DCI.AddToWorklist(Inner.getNode());
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
}

// If one of Y and Z is constant, they have been handled above. If both were
// constant, the addition would have been folded in SelectionDAG::getNode
// already. This ensures that the generic DAG combines won't undo the
// following reassociation.
assert(!YIsConstant && !ZIsConstant);

if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
// y are uniform and z isn't.
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
// z are uniform and y isn't.
// The goal is to push uniform operands up in the computation, so that they
// can be handled with scalar operations. We can't use reassociateScalarOps
// for this since it requires two identical commutative operations to
// reassociate.
if (Y->isDivergent())
std::swap(Y, Z);
SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
DCI.AddToWorklist(UniformInner.getNode());
return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
}

return SDValue();
}

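
The combines added to performPtrAddCombine are exercised by the test updates below. As one additional illustrative sketch (hypothetical, not part of the test file): for (ptradd %p, (add %v, 16)) with a single-use inner add, reassociating to (ptradd (ptradd %p, %v), 16) moves the constant to the outermost node, where it can later be folded into the immediate offset of the load.

define i32 @hoist_const_offset(ptr addrspace(1) %p, i64 %v) {
  ; %v is not constant, 16 is, and the add has a single use, so the combine
  ; may reassociate and hoist the constant to the outer ptradd.
  %sum = add i64 %v, 16
  %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %sum
  %val = load i32, ptr addrspace(1) %gep, align 4
  ret i32 %val
}
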
151 changes: 48 additions & 103 deletions llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -145,49 +145,29 @@ entry:

; Test skipping the lower-32-bit addition if it is unnecessary.
define ptr @huge_offset_low_32_unused(ptr %p) {
; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0
; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
; GFX942_LEGACY: ; %bb.0:
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
; GFX942-LABEL: huge_offset_low_32_unused:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_add_u32_e32 v1, 1, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
ret ptr %gep
}

; Reassociate address computation if it leads to more scalar operations.
define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
; GFX942_PTRADD-LABEL: reassoc_scalar_r:
; GFX942_PTRADD: ; %bb.0: ; %entry
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942_PTRADD-NEXT: s_endpgm
;
; GFX942_LEGACY-LABEL: reassoc_scalar_r:
; GFX942_LEGACY: ; %bb.0: ; %entry
; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942_LEGACY-NEXT: s_endpgm
; GFX942-LABEL: reassoc_scalar_r:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_add_u32 s2, s2, s6
; GFX942-NEXT: s_addc_u32 s3, s3, s7
; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
@@ -198,30 +178,18 @@ entry:
}

define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
; GFX942_PTRADD-LABEL: reassoc_scalar_l:
; GFX942_PTRADD: ; %bb.0: ; %entry
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942_PTRADD-NEXT: s_endpgm
;
; GFX942_LEGACY-LABEL: reassoc_scalar_l:
; GFX942_LEGACY: ; %bb.0: ; %entry
; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942_LEGACY-NEXT: s_endpgm
; GFX942-LABEL: reassoc_scalar_l:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_add_u32 s2, s2, s6
; GFX942-NEXT: s_addc_u32 s3, s3, s7
; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
@@ -233,24 +201,14 @@ entry:

; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
; GFX942_PTRADD-LABEL: shl_neg_offset:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2
; GFX942_PTRADD-NEXT: s_nop 1
; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: shl_neg_offset:
; GFX942_LEGACY: ; %bb.0:
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX942_LEGACY-NEXT: s_nop 1
; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
; GFX942-LABEL: shl_neg_offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
%offset = sub i64 0, %noffset
%x = shl i64 %offset, %shift
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
@@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4
; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12
; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: complextype_global_gep:
@@ -291,27 +248,15 @@

; Tests the tryFoldToMad64_32 PTRADD combine.
define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
; GFX942_PTRADD-LABEL: fold_mad64:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0
; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off
; GFX942_PTRADD-NEXT: s_endpgm
;
; GFX942_LEGACY-LABEL: fold_mad64:
; GFX942_LEGACY: ; %bb.0:
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off
; GFX942_LEGACY-NEXT: s_endpgm
; GFX942-LABEL: fold_mad64:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_mov_b32_e32 v2, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; GFX942-NEXT: global_store_dword v[0:1], v2, off
; GFX942-NEXT: s_endpgm
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0