Commit 11d17a8
[AMDGPU][SDAG] Add target-specific ISD::PTRADD combines
This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine.

For SWDEV-516125.
1 parent f070c74 commit 11d17a8

3 files changed (+160 −134 lines)
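The combines live in AMDGPU's combine hook for PTRADD nodes, SITargetLowering::performPtrAddCombine. For orientation, the dispatch into that hook looks roughly like the following sketch (reconstructed for context, not part of this diff; the actual PerformDAGCombine handles many more opcodes and defers to the AMDGPU base class):

```cpp
// Rough sketch of the target combine dispatch in SIISelLowering.cpp: the
// generic DAG combiner calls into the target hook, which routes ISD::PTRADD
// nodes to the combine extended by this patch.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  case ISD::PTRADD:
    return performPtrAddCombine(N, DCI);
  // ... many other opcodes elided ...
  default:
    break;
  }
  return SDValue();
}
```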

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -6710,7 +6710,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
     return SDValue();
   int64_t Offset = C2->getSExtValue();
   switch (Opcode) {
-  case ISD::ADD: break;
+  case ISD::ADD:
+  case ISD::PTRADD:
+    break;
   case ISD::SUB: Offset = -uint64_t(Offset); break;
   default: return SDValue();
   }
```
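With ISD::PTRADD accepted alongside ISD::ADD, the same symbol-offset folding now applies to pointer additions: a constant addend is absorbed into the global address node's offset. A condensed sketch of the surrounding function for context (the parts outside the hunk are not shown in this diff and are reconstructed from memory, so details may differ):

```cpp
// Fold (add/ptradd/sub (GlobalAddress GA, off), c) into a single
// GlobalAddress node with offset off +/- c, when offset folding is legal.
SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
                                       const GlobalAddressSDNode *GA,
                                       const SDNode *N2) {
  if (!getTargetLoweringInfo().isOffsetFoldingLegal(GA))
    return SDValue();
  auto *C2 = dyn_cast<ConstantSDNode>(N2);
  if (!C2)
    return SDValue();
  int64_t Offset = C2->getSExtValue();
  switch (Opcode) {
  case ISD::ADD:
  case ISD::PTRADD:
    break;
  case ISD::SUB: Offset = -uint64_t(Offset); break;
  default: return SDValue();
  }
  // The constant is absorbed into the global address node's offset.
  return getGlobalAddress(GA->getGlobal(), SDLoc(C2), VT,
                          GA->getOffset() + uint64_t(Offset));
}
```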

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 109 additions & 30 deletions
```diff
@@ -33,6 +33,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
 #include <optional>
 
 using namespace llvm;
+using namespace llvm::SDPatternMatch;
 
 #define DEBUG_TYPE "si-lower"
 
@@ -14469,7 +14471,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
 // instead of a tree.
 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
-  assert(N->getOpcode() == ISD::ADD);
+  assert(N->isAnyAdd());
 
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
@@ -14502,7 +14504,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   for (SDNode *User : LHS->users()) {
     // There is a use that does not feed into addition, so the multiply can't
     // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
-    if (User->getOpcode() != ISD::ADD)
+    if (!User->isAnyAdd())
       return SDValue();
 
     // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14614,8 +14616,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
 
   SDValue Hi = getHiHalf64(LHS, DAG);
   SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::PTRADD)
+    Opcode = ISD::ADD;
   SDValue AddHi =
-      DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+      DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
 
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -15089,42 +15094,116 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
-  if (N1.getOpcode() == ISD::ADD) {
-    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
-    // y is not, and (add y, z) is used only once.
-    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
-    // z is not, and (add y, z) is used only once.
-    // The goal is to move constant offsets to the outermost ptradd, to create
-    // more opportunities to fold offsets into memory instructions.
-    // Together with the generic combines in DAGCombiner.cpp, this also
-    // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
-    //
-    // This transform is here instead of in the general DAGCombiner as it can
-    // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
-    // AArch64's CPA.
-    SDValue X = N0;
-    SDValue Y = N1.getOperand(0);
-    SDValue Z = N1.getOperand(1);
-    if (N1.hasOneUse()) {
-      bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
-      bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
-      if (ZIsConstant != YIsConstant) {
-        // If both additions in the original were NUW, the new ones are as well.
-        SDNodeFlags Flags =
-            (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
-        if (YIsConstant)
-          std::swap(Y, Z);
+  // The following folds transform PTRADDs into regular arithmetic in cases
+  // where the PTRADD wouldn't be folded as an immediate offset into memory
+  // instructions anyway. They are target-specific in that other targets might
+  // prefer to not lose information about the pointer arithmetic.
+
+  // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+  // Adapted from DAGCombiner::visitADDLikeCommutative.
+  SDValue V, K;
+  if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+    SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
+    DCI.AddToWorklist(Inner.getNode());
+    return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+  }
+
+  // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+  // performAddCombine.
+  if (N1.getOpcode() == ISD::MUL) {
+    if (Subtarget->hasMad64_32()) {
+      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+        return Folded;
+    }
+  }
 
-        SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+  // If the 32 low bits of the constant are all zero, there is nothing to fold
+  // into an immediate offset, so it's better to eliminate the unnecessary
+  // addition for the lower 32 bits than to preserve the PTRADD.
+  // Analogous to a fold in performAddCombine.
+  if (VT == MVT::i64) {
+    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+      return Folded;
+  }
+
+  if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+    // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
+    // global address GA and constant c, such that c can be folded into GA.
+    SDValue GAValue = N0.getOperand(0);
+    if (const GlobalAddressSDNode *GA =
+            dyn_cast<GlobalAddressSDNode>(GAValue)) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
+        // If both additions in the original were NUW, reassociation preserves
+        // that.
+        SDNodeFlags Flags =
+            (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+        SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
         DCI.AddToWorklist(Inner.getNode());
-        return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+        return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
       }
     }
   }
 
+  if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+    return SDValue();
+
+  // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+  // y is not, and (add y, z) is used only once.
+  // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+  // z is not, and (add y, z) is used only once.
+  // The goal is to move constant offsets to the outermost ptradd, to create
+  // more opportunities to fold offsets into memory instructions.
+  // Together with the generic combines in DAGCombiner.cpp, this also
+  // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+  //
+  // This transform is here instead of in the general DAGCombiner as it can
+  // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+  // AArch64's CPA.
+  SDValue X = N0;
+  SDValue Y = N1.getOperand(0);
+  SDValue Z = N1.getOperand(1);
+  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+  // If both additions in the original were NUW, reassociation preserves that.
+  SDNodeFlags ReassocFlags =
+      (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+
+  if (ZIsConstant != YIsConstant) {
+    if (YIsConstant)
+      std::swap(Y, Z);
+    SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+    DCI.AddToWorklist(Inner.getNode());
+    return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+  }
+
+  // If one of Y and Z is constant, they have been handled above. If both were
+  // constant, the addition would have been folded in SelectionDAG::getNode
+  // already. This ensures that the generic DAG combines won't undo the
+  // following reassociation.
+  assert(!YIsConstant && !ZIsConstant);
+
+  if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+    // y are uniform and z isn't.
+    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+    // z are uniform and y isn't.
+    // The goal is to push uniform operands up in the computation, so that they
+    // can be handled with scalar operations. We can't use reassociateScalarOps
+    // for this since it requires two identical commutative operations to
+    // reassociate.
+    if (Y->isDivergent())
+      std::swap(Y, Z);
+    SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+    DCI.AddToWorklist(UniformInner.getNode());
+    return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
+  }
+
   return SDValue();
 }
```
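Two helpers that the new code leans on deserve a note. SDNode::isAnyAdd() is used to treat integer additions and pointer additions uniformly; judging by its use here, its behavior presumably amounts to the following sketch (assumed, the real helper lives in the SelectionDAG node headers):

```cpp
// Assumed equivalent of SDNode::isAnyAdd() as used in tryFoldToMad64_32
// above: a PTRADD is an integer addition with pointer provenance, so both
// opcodes may feed the mad64_32 formation logic.
static bool isAnyAddLike(const SDNode *N) {
  return N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::PTRADD;
}
```

The shl-of-negation fold is written with the SDPatternMatch DSL pulled in by the new include. A minimal standalone illustration of the matcher shape it uses (the wrapper function and its name are illustrative; only the sd_match call mirrors the patch):

```cpp
#include "llvm/CodeGen/SDPatternMatch.h"

using namespace llvm;
using namespace llvm::SDPatternMatch;

// Returns true if Addend has the shape shl(0 - V, K), binding V and K on
// success; m_Neg matches a subtraction from zero. This is the addend shape
// that performPtrAddCombine rewrites from (ptradd x, shl(0 - v, k)) to
// sub(x, shl(v, k)).
static bool matchShlOfNeg(SDValue Addend, SDValue &V, SDValue &K) {
  return sd_match(Addend, m_Shl(m_Neg(m_Value(V)), m_Value(K)));
}
```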

llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

Lines changed: 48 additions & 103 deletions
```diff
@@ -145,49 +145,29 @@ entry:
 
 ; Test skipping the lower-32-bit addition if it is unnecessary.
 define ptr @huge_offset_low_32_unused(ptr %p) {
-; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    s_mov_b32 s0, 0
-; GFX942_PTRADD-NEXT:    s_mov_b32 s1, 1
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: huge_offset_low_32_unused:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
   ret ptr %gep
 }
 
 ; Reassociate address computation if it leads to more scalar operations.
 define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_r:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_r:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT:    s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: reassoc_scalar_r:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_u32 s2, s2, s6
+; GFX942-NEXT:    s_addc_u32 s3, s3, s7
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
@@ -198,30 +178,18 @@ entry:
 }
 
 define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_l:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_l:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT:    s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: reassoc_scalar_l:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_u32 s2, s2, s6
+; GFX942-NEXT:    s_addc_u32 s3, s3, s7
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
@@ -233,24 +201,14 @@ entry:
 
 ; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
 define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
-; GFX942_PTRADD-LABEL: shl_neg_offset:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_sub_co_u32_e32 v2, vcc, 0, v2
-; GFX942_PTRADD-NEXT:    s_nop 1
-; GFX942_PTRADD-NEXT:    v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: shl_neg_offset:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_LEGACY-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX942_LEGACY-NEXT:    s_nop 1
-; GFX942_LEGACY-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: shl_neg_offset:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %offset = sub i64 0, %noffset
   %x = shl i64 %offset, %shift
   %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
@@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
 ; GFX942_PTRADD:       ; %bb.0:
 ; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942_PTRADD-NEXT:    s_getpc_b64 s[0:1]
-; GFX942_PTRADD-NEXT:    s_add_u32 s0, s0, v0@rel32@lo+4
-; GFX942_PTRADD-NEXT:    s_addc_u32 s1, s1, v0@rel32@hi+12
+; GFX942_PTRADD-NEXT:    s_add_u32 s0, s0, v0@rel32@lo+14
+; GFX942_PTRADD-NEXT:    s_addc_u32 s1, s1, v0@rel32@hi+22
 ; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 10
 ; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942_LEGACY-LABEL: complextype_global_gep:
@@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
 
 ; Tests the tryFoldToMad64_32 PTRADD combine.
 define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
-; GFX942_PTRADD-LABEL: fold_mad64:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT:    v_mul_hi_u32_u24_e32 v1, 12, v0
-; GFX942_PTRADD-NEXT:    v_mul_u32_u24_e32 v0, 12, v0
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, 1.0
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT:    global_store_dword v[0:1], v2, off
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: fold_mad64:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, 1.0
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
-; GFX942_LEGACY-NEXT:    global_store_dword v[0:1], v2, off
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: fold_mad64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 1.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_endpgm
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0
```

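Where the PTRADD and legacy lowerings now produce identical code, the autogenerated checks collapse from the GFX942_PTRADD/GFX942_LEGACY prefixes into the shared GFX942 prefix. The test's RUN lines are not part of this diff; presumably they toggle the experimental SDAG PTRADD lowering along these lines (the exact flag name is an assumption):

```llvm
; Assumed RUN lines, for illustration only:
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s \
; RUN:   | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s \
; RUN:   | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
```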