Commit 17e4584

[AMDGPU][SDAG] Add target-specific ISD::PTRADD combines
This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine.

For SWDEV-516125.
1 parent ee92e37 commit 17e4584
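
As a rough illustration of what these combines exploit (not part of the patch; a standalone C++ sketch with invented names): a PTRADD is wrap-around 64-bit integer arithmetic on the pointer value, so reassociating a constant offset outward and rewriting an add of a negated, shifted value as a subtraction are both unconditionally correct modulo 2^64:

#include <cassert>
#include <cstdint>

// Reassociation: x + (y + z) == (x + y) + z in modular 64-bit arithmetic.
// Moving a constant z to the outermost add lets it be folded into a memory
// instruction's immediate offset later.
uint64_t reassoc(uint64_t X, uint64_t Y, uint64_t Z) { return (X + Y) + Z; }

// shl-of-negation: x + ((0 - v) << k) == x - (v << k), again modulo 2^64.
uint64_t shlNeg(uint64_t X, uint64_t V, unsigned K) { return X - (V << K); }

int main() {
  uint64_t X = 0x123456789abcdef0, Y = 42, Z = 0xffff0000;
  assert(reassoc(X, Y, Z) == X + (Y + Z));
  assert(shlNeg(X, 7, 5) == X + ((0 - uint64_t(7)) << 5));
  return 0;
}

What the combines must additionally take care of, as the comments in the diffs below note, is whether the no-unsigned-wrap flag and inbounds information survive the rewrite.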

3 files changed: +160 -134 lines
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 3 additions & 1 deletion
@@ -6710,7 +6710,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
     return SDValue();
   int64_t Offset = C2->getSExtValue();
   switch (Opcode) {
-  case ISD::ADD: break;
+  case ISD::ADD:
+  case ISD::PTRADD:
+    break;
   case ISD::SUB: Offset = -uint64_t(Offset); break;
   default: return SDValue();
   }
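
For context, FoldSymbolOffset folds a constant operand into the offset of a global-address operand, and the switch above only decides how the constant enters that offset. A minimal sketch of the decision, assuming a local enum as a stand-in for the real ISD opcodes (this mirrors only the switch, not the real FoldSymbolOffset signature):

#include <cassert>
#include <cstdint>
#include <optional>

// Local stand-ins for the ISD opcodes used above.
enum class Op { Add, PtrAdd, Sub, Other };

std::optional<int64_t> foldedOffset(Op Opcode, int64_t GAOffset, int64_t C) {
  switch (Opcode) {
  case Op::Add:
  case Op::PtrAdd: // PTRADD is now folded exactly like ADD.
    return GAOffset + C;
  case Op::Sub: // For SUB, the constant is negated before folding.
    return GAOffset + int64_t(0 - uint64_t(C));
  default: // Any other opcode blocks the fold.
    return std::nullopt;
  }
}

int main() {
  assert(*foldedOffset(Op::PtrAdd, 100, 8) == 108);
  assert(*foldedOffset(Op::Sub, 100, 8) == 92);
  assert(!foldedOffset(Op::Other, 100, 8));
  return 0;
}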

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 109 additions & 30 deletions
@@ -33,6 +33,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
 #include <optional>
 
 using namespace llvm;
+using namespace llvm::SDPatternMatch;
 
 #define DEBUG_TYPE "si-lower"
 
@@ -14480,7 +14482,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
 // instead of a tree.
 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
-  assert(N->getOpcode() == ISD::ADD);
+  assert(N->isAnyAdd());
 
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
@@ -14513,7 +14515,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   for (SDNode *User : LHS->users()) {
     // There is a use that does not feed into addition, so the multiply can't
     // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
-    if (User->getOpcode() != ISD::ADD)
+    if (!User->isAnyAdd())
       return SDValue();
 
     // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14625,8 +14627,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
 
   SDValue Hi = getHiHalf64(LHS, DAG);
   SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::PTRADD)
+    Opcode = ISD::ADD;
   SDValue AddHi =
-      DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+      DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
 
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -15100,42 +15105,116 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
-  if (N1.getOpcode() == ISD::ADD) {
-    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
-    // y is not, and (add y, z) is used only once.
-    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
-    // z is not, and (add y, z) is used only once.
-    // The goal is to move constant offsets to the outermost ptradd, to create
-    // more opportunities to fold offsets into memory instructions.
-    // Together with the generic combines in DAGCombiner.cpp, this also
-    // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
-    //
-    // This transform is here instead of in the general DAGCombiner as it can
-    // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
-    // AArch64's CPA.
-    SDValue X = N0;
-    SDValue Y = N1.getOperand(0);
-    SDValue Z = N1.getOperand(1);
-    if (N1.hasOneUse()) {
-      bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
-      bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
-      if (ZIsConstant != YIsConstant) {
-        // If both additions in the original were NUW, the new ones are as well.
-        SDNodeFlags Flags =
-            (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
-        if (YIsConstant)
-          std::swap(Y, Z);
+  // The following folds transform PTRADDs into regular arithmetic in cases
+  // where the PTRADD wouldn't be folded as an immediate offset into memory
+  // instructions anyway. They are target-specific in that other targets might
+  // prefer to not lose information about the pointer arithmetic.
+
+  // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+  // Adapted from DAGCombiner::visitADDLikeCommutative.
+  SDValue V, K;
+  if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+    SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
+    DCI.AddToWorklist(Inner.getNode());
+    return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+  }
+
+  // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+  // performAddCombine.
+  if (N1.getOpcode() == ISD::MUL) {
+    if (Subtarget->hasMad64_32()) {
+      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+        return Folded;
+    }
+  }
 
-        SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+  // If the 32 low bits of the constant are all zero, there is nothing to fold
+  // into an immediate offset, so it's better to eliminate the unnecessary
+  // addition for the lower 32 bits than to preserve the PTRADD.
+  // Analogous to a fold in performAddCombine.
+  if (VT == MVT::i64) {
+    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+      return Folded;
+  }
+
+  if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+    // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
+    // global address GA and constant c, such that c can be folded into GA.
+    SDValue GAValue = N0.getOperand(0);
+    if (const GlobalAddressSDNode *GA =
+            dyn_cast<GlobalAddressSDNode>(GAValue)) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
+        // If both additions in the original were NUW, reassociation preserves
+        // that.
+        SDNodeFlags Flags =
+            (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+        SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
         DCI.AddToWorklist(Inner.getNode());
-        return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+        return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
       }
     }
   }
 
+  if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+    return SDValue();
+
+  // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+  // y is not, and (add y, z) is used only once.
+  // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+  // z is not, and (add y, z) is used only once.
+  // The goal is to move constant offsets to the outermost ptradd, to create
+  // more opportunities to fold offsets into memory instructions.
+  // Together with the generic combines in DAGCombiner.cpp, this also
+  // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+  //
+  // This transform is here instead of in the general DAGCombiner as it can
+  // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+  // AArch64's CPA.
+  SDValue X = N0;
+  SDValue Y = N1.getOperand(0);
+  SDValue Z = N1.getOperand(1);
+  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+  // If both additions in the original were NUW, reassociation preserves that.
+  SDNodeFlags ReassocFlags =
+      (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+
+  if (ZIsConstant != YIsConstant) {
+    if (YIsConstant)
+      std::swap(Y, Z);
+    SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+    DCI.AddToWorklist(Inner.getNode());
+    return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+  }
+
+  // If one of Y and Z is constant, they have been handled above. If both were
+  // constant, the addition would have been folded in SelectionDAG::getNode
+  // already. This ensures that the generic DAG combines won't undo the
+  // following reassociation.
+  assert(!YIsConstant && !ZIsConstant);
+
+  if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+    // y are uniform and z isn't.
+    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+    // z are uniform and y isn't.
+    // The goal is to push uniform operands up in the computation, so that they
+    // can be handled with scalar operations. We can't use reassociateScalarOps
+    // for this since it requires two identical commutative operations to
+    // reassociate.
+    if (Y->isDivergent())
+      std::swap(Y, Z);
+    SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+    DCI.AddToWorklist(UniformInner.getNode());
+    return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
  }
 
   return SDValue();
 }
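
To see why skipping the low 32 bits in foldAddSub64WithZeroLowBitsTo32 is safe for PTRADD as well: when the low 32 bits of the constant are zero, the 64-bit add can never carry out of the low half, so only the high halves need to be added. A minimal standalone check (lo32/hi32 are helpers assumed for this sketch, not LLVM's Lo_32/Hi_32):

#include <cassert>
#include <cstdint>

uint32_t lo32(uint64_t V) { return uint32_t(V); }
uint32_t hi32(uint64_t V) { return uint32_t(V >> 32); }

int main() {
  uint64_t P = 0xdeadbeefcafef00d;
  uint64_t C = uint64_t(1) << 32; // low 32 bits are zero
  uint64_t Sum = P + C;
  assert(lo32(Sum) == lo32(P));           // low half is unchanged
  assert(hi32(Sum) == hi32(P) + hi32(C)); // a plain 32-bit add suffices
  return 0;
}

This is exactly the shape of the huge_offset_low_32_unused test in the diff below, where only a v_add_u32 on the high register remains.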

llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

Lines changed: 48 additions & 103 deletions
@@ -145,49 +145,29 @@ entry:
 
 ; Test skipping the lower-32-bit addition if it is unnecessary.
 define ptr @huge_offset_low_32_unused(ptr %p) {
-; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0
-; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: huge_offset_low_32_unused:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
   ret ptr %gep
 }
 
 ; Reassociate address computation if it leads to more scalar operations.
 define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_r:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_r:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: reassoc_scalar_r:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_u32 s2, s2, s6
+; GFX942-NEXT: s_addc_u32 s3, s3, s7
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT: s_endpgm
 entry:
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
@@ -198,30 +178,18 @@ entry:
 }
 
 define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_l:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_l:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: reassoc_scalar_l:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_u32 s2, s2, s6
+; GFX942-NEXT: s_addc_u32 s3, s3, s7
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT: s_endpgm
 entry:
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
@@ -233,24 +201,14 @@ entry:
 
 ; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
 define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
-; GFX942_PTRADD-LABEL: shl_neg_offset:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: shl_neg_offset:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: shl_neg_offset:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
   %offset = sub i64 0, %noffset
   %x = shl i64 %offset, %shift
   %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
@@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
 ; GFX942_PTRADD: ; %bb.0:
 ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
-; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4
-; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12
+; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
+; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
 ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10
 ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942_LEGACY-LABEL: complextype_global_gep:
@@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
 
 ; Tests the tryFoldToMad64_32 PTRADD combine.
 define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
-; GFX942_PTRADD-LABEL: fold_mad64:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0
-; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: fold_mad64:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
-; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: fold_mad64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; GFX942-NEXT: global_store_dword v[0:1], v2, off
+; GFX942-NEXT: s_endpgm
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0
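
The fold_mad64 test above expects base + zext(v) * 12 to become a single v_mad_u64_u32, which multiplies two 32-bit values and adds the full 64-bit product to a 64-bit addend. A scalar model of that arithmetic (the function name is invented for this sketch):

#include <cassert>
#include <cstdint>

// Computes a*b + c with a 32x32->64 multiply, like v_mad_u64_u32.
uint64_t mad_u64_u32(uint32_t A, uint32_t B, uint64_t C) {
  return uint64_t(A) * uint64_t(B) + C;
}

int main() {
  uint64_t Base = 0x7000000000;
  uint32_t V = 0x3ff; // work-item id masked to 10 bits, as in the test
  assert(mad_u64_u32(V, 12, Base) == Base + uint64_t(V) * 12);
  return 0;
}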
