Commit 88860bc

[AMDGPU][SDAG] Add target-specific ISD::PTRADD combines
This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine.

For SWDEV-516125.
1 parent a3f6f9e commit 88860bc
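
For context: ISD::PTRADD is SelectionDAG's dedicated pointer-addition node. A minimal LLVM IR sketch of the kind of address computation these combines target (a made-up example, not taken from this commit's tests):

; The GEP below becomes (ptradd %p, (add %i, 8)) in the DAG. The reassociation
; added in this patch moves the constant outward, to (ptradd (ptradd %p, %i), 8),
; so the 8 can later be folded into the memory instruction's immediate offset.
define void @ptradd_example(ptr addrspace(1) %p, i64 %i) {
  %off = add i64 %i, 8
  %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %off
  store i32 0, ptr addrspace(1) %gep, align 4
  ret void
}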

Showing 3 changed files with 160 additions and 134 deletions.

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 3 additions & 1 deletion
@@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
     return SDValue();
   int64_t Offset = C2->getSExtValue();
   switch (Opcode) {
-  case ISD::ADD: break;
+  case ISD::ADD:
+  case ISD::PTRADD:
+    break;
   case ISD::SUB: Offset = -uint64_t(Offset); break;
   default: return SDValue();
   }
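
With ISD::PTRADD accepted here, FoldSymbolOffset can fold a constant ptradd offset directly into the symbol's address node. A hedged IR sketch of the situation this enables (hypothetical example; @g and the offset value are invented for illustration):

@g = external addrspace(1) global [64 x i32]

define i32 @global_const_offset() {
  ; In the DAG this address is (ptradd GlobalAddress:@g, 40); when offset
  ; folding is legal for the target, the 40 becomes part of the
  ; TargetGlobalAddress instead of surviving as a separate addition.
  %gep = getelementptr inbounds i8, ptr addrspace(1) @g, i64 40
  %v = load i32, ptr addrspace(1) %gep, align 4
  ret i32 %v
}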

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 109 additions & 30 deletions
@@ -33,6 +33,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
 #include <optional>
 
 using namespace llvm;
+using namespace llvm::SDPatternMatch;
 
 #define DEBUG_TYPE "si-lower"
 
@@ -14329,7 +14331,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
 // instead of a tree.
 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
-  assert(N->getOpcode() == ISD::ADD);
+  assert(N->isAnyAdd());
 
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
@@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   for (SDNode *User : LHS->users()) {
     // There is a use that does not feed into addition, so the multiply can't
     // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
-    if (User->getOpcode() != ISD::ADD)
+    if (!User->isAnyAdd())
       return SDValue();
 
     // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
 
   SDValue Hi = getHiHalf64(LHS, DAG);
   SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::PTRADD)
+    Opcode = ISD::ADD;
   SDValue AddHi =
-      DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+      DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
 
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -14949,42 +14954,116 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
-  if (N1.getOpcode() == ISD::ADD) {
-    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
-    // y is not, and (add y, z) is used only once.
-    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
-    // z is not, and (add y, z) is used only once.
-    // The goal is to move constant offsets to the outermost ptradd, to create
-    // more opportunities to fold offsets into memory instructions.
-    // Together with the generic combines in DAGCombiner.cpp, this also
-    // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
-    //
-    // This transform is here instead of in the general DAGCombiner as it can
-    // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
-    // AArch64's CPA.
-    SDValue X = N0;
-    SDValue Y = N1.getOperand(0);
-    SDValue Z = N1.getOperand(1);
-    if (N1.hasOneUse()) {
-      bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
-      bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
-      if (ZIsConstant != YIsConstant) {
-        // If both additions in the original were NUW, the new ones are as well.
-        SDNodeFlags Flags =
-            (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
-        if (YIsConstant)
-          std::swap(Y, Z);
+  // The following folds transform PTRADDs into regular arithmetic in cases
+  // where the PTRADD wouldn't be folded as an immediate offset into memory
+  // instructions anyway. They are target-specific in that other targets might
+  // prefer to not lose information about the pointer arithmetic.
+
+  // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+  // Adapted from DAGCombiner::visitADDLikeCommutative.
+  SDValue V, K;
+  if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+    SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
+    DCI.AddToWorklist(Inner.getNode());
+    return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+  }
+
+  // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+  // performAddCombine.
+  if (N1.getOpcode() == ISD::MUL) {
+    if (Subtarget->hasMad64_32()) {
+      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+        return Folded;
+    }
+  }
 
-        SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+  // If the 32 low bits of the constant are all zero, there is nothing to fold
+  // into an immediate offset, so it's better to eliminate the unnecessary
+  // addition for the lower 32 bits than to preserve the PTRADD.
+  // Analogous to a fold in performAddCombine.
+  if (VT == MVT::i64) {
+    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+      return Folded;
+  }
+
+  if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+    // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with
+    // global address GA and constant c, such that c can be folded into GA.
+    SDValue GAValue = N0.getOperand(0);
+    if (const GlobalAddressSDNode *GA =
+            dyn_cast<GlobalAddressSDNode>(GAValue)) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
+        // If both additions in the original were NUW, reassociation preserves
+        // that.
+        SDNodeFlags Flags =
+            (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+        SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
         DCI.AddToWorklist(Inner.getNode());
-        return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+        return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
       }
     }
   }
 
+  if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+    return SDValue();
+
+  // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+  // y is not, and (add y, z) is used only once.
+  // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+  // z is not, and (add y, z) is used only once.
+  // The goal is to move constant offsets to the outermost ptradd, to create
+  // more opportunities to fold offsets into memory instructions.
+  // Together with the generic combines in DAGCombiner.cpp, this also
+  // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
+  //
+  // This transform is here instead of in the general DAGCombiner as it can
+  // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+  // AArch64's CPA.
+  SDValue X = N0;
+  SDValue Y = N1.getOperand(0);
+  SDValue Z = N1.getOperand(1);
+  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+  // If both additions in the original were NUW, reassociation preserves that.
+  SDNodeFlags ReassocFlags =
+      (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+
+  if (ZIsConstant != YIsConstant) {
+    if (YIsConstant)
+      std::swap(Y, Z);
+    SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+    DCI.AddToWorklist(Inner.getNode());
+    return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+  }
+
+  // If one of Y and Z is constant, it has been handled above. If both were
+  // constant, the addition would have been folded in SelectionDAG::getNode
+  // already. This ensures that the generic DAG combines won't undo the
+  // following reassociation.
+  assert(!YIsConstant && !ZIsConstant);
+
+  if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+    // y are uniform and z isn't.
+    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+    // z are uniform and y isn't.
+    // The goal is to push uniform operands up in the computation, so that they
+    // can be handled with scalar operations. We can't use reassociateScalarOps
+    // for this since it requires two identical commutative operations to
+    // reassociate.
+    if (Y->isDivergent())
+      std::swap(Y, Z);
+    SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+    DCI.AddToWorklist(UniformInner.getNode());
+    return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
+  }
+
   return SDValue();
 }
 
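
As a concrete illustration of the first new fold, (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)): in IR terms it fires on shapes like the sketch below (a hypothetical function; the shl_neg_offset test in this commit exercises the same pattern). Since a negated, shifted offset can never be folded into an immediate addressing field anyway, rewriting the PTRADD as a plain subtraction loses nothing and saves the explicit negation:

define ptr addrspace(1) @sub_fold_sketch(ptr addrspace(1) %p, i64 %v, i64 %k) {
  ; %off = (0 - %v) << %k; the combine turns the final address computation
  ; into %p - (%v << %k), avoiding the 64-bit negation.
  %neg = sub i64 0, %v
  %off = shl i64 %neg, %k
  %gep = getelementptr i8, ptr addrspace(1) %p, i64 %off
  ret ptr addrspace(1) %gep
}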

llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

Lines changed: 48 additions & 103 deletions
@@ -145,49 +145,29 @@ entry:
 
 ; Test skipping the lower-32-bit addition if it is unnecessary.
 define ptr @huge_offset_low_32_unused(ptr %p) {
-; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    s_mov_b32 s0, 0
-; GFX942_PTRADD-NEXT:    s_mov_b32 s1, 1
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: huge_offset_low_32_unused:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
   ret ptr %gep
 }
 
 ; Reassociate address computation if it leads to more scalar operations.
 define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_r:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_r:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT:    s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: reassoc_scalar_r:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_u32 s2, s2, s6
+; GFX942-NEXT:    s_addc_u32 s3, s3, s7
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
@@ -198,30 +178,18 @@ entry:
 }
 
 define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_l:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_l:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT:    s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: reassoc_scalar_l:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_u32 s2, s2, s6
+; GFX942-NEXT:    s_addc_u32 s3, s3, s7
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
@@ -233,24 +201,14 @@
 
 ; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
 define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
-; GFX942_PTRADD-LABEL: shl_neg_offset:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_sub_co_u32_e32 v2, vcc, 0, v2
-; GFX942_PTRADD-NEXT:    s_nop 1
-; GFX942_PTRADD-NEXT:    v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: shl_neg_offset:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_LEGACY-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX942_LEGACY-NEXT:    s_nop 1
-; GFX942_LEGACY-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: shl_neg_offset:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %offset = sub i64 0, %noffset
   %x = shl i64 %offset, %shift
   %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
@@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
 ; GFX942_PTRADD:       ; %bb.0:
 ; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942_PTRADD-NEXT:    s_getpc_b64 s[0:1]
-; GFX942_PTRADD-NEXT:    s_add_u32 s0, s0, v0@rel32@lo+4
-; GFX942_PTRADD-NEXT:    s_addc_u32 s1, s1, v0@rel32@hi+12
+; GFX942_PTRADD-NEXT:    s_add_u32 s0, s0, v0@rel32@lo+14
+; GFX942_PTRADD-NEXT:    s_addc_u32 s1, s1, v0@rel32@hi+22
 ; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 10
 ; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942_LEGACY-LABEL: complextype_global_gep:
@@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
 
 ; Tests the tryFoldToMad64_32 PTRADD combine.
 define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
-; GFX942_PTRADD-LABEL: fold_mad64:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT:    v_mul_hi_u32_u24_e32 v1, 12, v0
-; GFX942_PTRADD-NEXT:    v_mul_u32_u24_e32 v0, 12, v0
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, 1.0
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT:    global_store_dword v[0:1], v2, off
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: fold_mad64:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, 1.0
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
-; GFX942_LEGACY-NEXT:    global_store_dword v[0:1], v2, off
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: fold_mad64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 1.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_endpgm
   %voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
   %voffset = zext i32 %voffset32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0
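
The test changes above collapse the previously distinct GFX942_PTRADD and GFX942_LEGACY check prefixes into a shared GFX942 prefix wherever both lowerings now produce identical code. A hedged sketch of how such a test is typically driven (the commit's actual RUN lines, including whichever flag selects the experimental PTRADD lowering, are not visible in this diff):

; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s
; (A second RUN line presumably enables the PTRADD lowering; its flag is an
; assumption here, so it is omitted.)
define ptr @run_line_sketch(ptr %p) {
; GFX942-LABEL: run_line_sketch:
  %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
  ret ptr %gep
}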
