[AMDGPU][SDAG] Add target-specific ISD::PTRADD combines #143673
base: users/ritter-x2a/06-11-_amdgpu_sdag_tests_for_target-specific_isd_ptradd_combines
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu

Author: Fabian Ritter (ritter-x2a)

Changes

This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine.

For SWDEV-516125.

Full diff: https://github.com/llvm/llvm-project/pull/143673.diff

3 Files Affected:
- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
- llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
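For readers unfamiliar with the SDPatternMatch utilities this patch starts using in SIISelLowering.cpp, here is a minimal sketch of the matcher style (not part of the patch; the helper name matchShlOfNeg is hypothetical):

```cpp
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;
using namespace llvm::SDPatternMatch;

// Hypothetical helper mirroring the inline sd_match call in the combine
// below: returns true iff N has the form shl(0 - V, K), binding V and K.
static bool matchShlOfNeg(SDValue N, SDValue &V, SDValue &K) {
  return sd_match(N, m_Shl(m_Neg(m_Value(V)), m_Value(K)));
}
```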
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 526a395181764..cc24585b4e4ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6765,7 +6765,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
return SDValue();
int64_t Offset = C2->getSExtValue();
switch (Opcode) {
- case ISD::ADD: break;
+ case ISD::ADD:
+ case ISD::PTRADD:
+ break;
case ISD::SUB: Offset = -uint64_t(Offset); break;
default: return SDValue();
}
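With ISD::PTRADD accepted in this switch, FoldSymbolOffset treats pointer adds like plain adds, so a constant offset can be folded into a global-address operand. A minimal sketch of the intended effect (assumes a SelectionDAG &DAG, SDLoc DL, and GlobalValue *GV in scope; whether the fold actually fires still depends on the target's isOffsetFoldingLegal):

```cpp
// (ptradd (globaladdress @g, 0), 16) may now constant-fold to
// (globaladdress @g, 16), like the pre-existing ISD::ADD case.
SDValue GA = DAG.getGlobalAddress(GV, DL, MVT::i64, /*Offset=*/0);
SDValue Off = DAG.getConstant(16, DL, MVT::i64);
SDValue P = DAG.getNode(ISD::PTRADD, DL, MVT::i64, GA, Off);
```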
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0708c76bcb3fc..6149d7ca84354 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "si-lower"
@@ -14320,7 +14322,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
- assert(N->getOpcode() == ISD::ADD);
+ assert(N->isAnyAdd());
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -14353,7 +14355,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
for (SDNode *User : LHS->users()) {
// There is a use that does not feed into addition, so the multiply can't
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
- if (User->getOpcode() != ISD::ADD)
+ if (!User->isAnyAdd())
return SDValue();
// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14465,8 +14467,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
SDValue Hi = getHiHalf64(LHS, DAG);
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::PTRADD)
+ Opcode = ISD::ADD;
SDValue AddHi =
- DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+ DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -14940,44 +14945,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() == ISD::ADD) {
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
- // y is not, and (add y, z) is used only once.
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
- // z is not, and (add y, z) is used only once.
- // The goal is to move constant offsets to the outermost ptradd, to create
- // more opportunities to fold offsets into memory instructions.
- // Together with the generic combines in DAGCombiner.cpp, this also
- // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
- //
- // This transform is here instead of in the general DAGCombiner as it can
- // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
- // AArch64's CPA.
- SDValue X = N0;
- SDValue Y = N1.getOperand(0);
- SDValue Z = N1.getOperand(1);
- bool N1OneUse = N1.hasOneUse();
- bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
- bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
- if ((ZIsConstant != YIsConstant) && N1OneUse) {
- SDNodeFlags Flags;
- // If both additions in the original were NUW, the new ones are as well.
- if (N->getFlags().hasNoUnsignedWrap() &&
- N1->getFlags().hasNoUnsignedWrap())
- Flags |= SDNodeFlags::NoUnsignedWrap;
-
- if (YIsConstant)
- std::swap(Y, Z);
-
- SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
- DCI.AddToWorklist(Inner.getNode());
- return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ // The following folds transform PTRADDs into regular arithmetic in cases
+ // where the PTRADD wouldn't be folded as an immediate offset into memory
+ // instructions anyway. They are target-specific in that other targets might
+ // prefer to not lose information about the pointer arithmetic.
+
+ // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+ // Adapted from DAGCombiner::visitADDLikeCommutative.
+ SDValue V, K;
+ if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+ SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+ }
+
+ // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+ // performAddCombine.
+ if (N1.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
+ }
+ }
+
+ // If the 32 low bits of the constant are all zero, there is nothing to fold
+ // into an immediate offset, so it's better to eliminate the unnecessary
+ // addition for the lower 32 bits than to preserve the PTRADD.
+ // Analogous to a fold in performAddCombine.
+ if (VT == MVT::i64) {
+ if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+ return Folded;
+ }
+
+ if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+ // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
+ // global address GA and constant c, such that c can be folded into GA.
+ SDValue GAValue = N0.getOperand(0);
+ if (const GlobalAddressSDNode *GA =
+ dyn_cast<GlobalAddressSDNode>(GAValue)) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
+ SDNodeFlags Flags;
+ // If both additions in the original were NUW, reassociation preserves
+ // that.
+ if (N->getFlags().hasNoUnsignedWrap() &&
+ N0->getFlags().hasNoUnsignedWrap())
+ Flags |= SDNodeFlags::NoUnsignedWrap;
+ SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
+ }
}
}
+ if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+ return SDValue();
+
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+ // y is not, and (add y, z) is used only once.
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+ // z is not, and (add y, z) is used only once.
+ // The goal is to move constant offsets to the outermost ptradd, to create
+ // more opportunities to fold offsets into memory instructions.
+ // Together with the generic combines in DAGCombiner.cpp, this also
+ // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+ //
+ // This transform is here instead of in the general DAGCombiner as it can
+ // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+ // AArch64's CPA.
+ SDValue X = N0;
+ SDValue Y = N1.getOperand(0);
+ SDValue Z = N1.getOperand(1);
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ SDNodeFlags ReassocFlags;
+ // If both additions in the original were NUW, reassociation preserves that.
+ if (N->getFlags().hasNoUnsignedWrap() && N1->getFlags().hasNoUnsignedWrap())
+ ReassocFlags |= SDNodeFlags::NoUnsignedWrap;
+ if (ZIsConstant != YIsConstant) {
+
+ if (YIsConstant)
+ std::swap(Y, Z);
+
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+ }
+
+ // If one of Y and Z is constant, they have been handled above. If both were
+ // constant, the addition would have been folded in SelectionDAG::getNode
+ // already. This ensures that the generic DAG combines won't undo the
+ // following reassociation.
+ assert(!YIsConstant && !ZIsConstant);
+
+ if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+ // y are uniform and z isn't.
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+ // z are uniform and y isn't.
+ // The goal is to push uniform operands up in the computation, so that they
+ // can be handled with scalar operations. We can't use reassociateScalarOps
+ // for this since it requires two identical commutative operations to
+ // reassociate.
+ if (Y->isDivergent())
+ std::swap(Y, Z);
+ SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(UniformInner.getNode());
+ return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
+ }
+
return SDValue();
}
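A note on the SDNode::isAnyAdd() calls introduced above: per its usage in this patch, the predicate is equivalent to the following sketch, which is what lets tryFoldToMad64_32 and the multiply-user scan serve ISD::ADD and ISD::PTRADD uniformly (the function name isAnyAddSketch is hypothetical):

```cpp
#include "llvm/CodeGen/SelectionDAGNodes.h"

// Equivalent restatement of SDNode::isAnyAdd() as used in this patch:
// true for both integer adds and pointer adds.
static bool isAnyAddSketch(const llvm::SDNode *N) {
  unsigned Opc = N->getOpcode();
  return Opc == llvm::ISD::ADD || Opc == llvm::ISD::PTRADD;
}
```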
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 1ec94162951a6..c00bccdbce6b7 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -145,49 +145,29 @@ entry:
; Test skipping the lower-32-bit addition if it is unnecessary.
define ptr @huge_offset_low_32_unused(ptr %p) {
-; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0
-; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: huge_offset_low_32_unused:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
ret ptr %gep
}
; Reassociate address computation if it leads to more scalar operations.
define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_r:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_r:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: reassoc_scalar_r:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_u32 s2, s2, s6
+; GFX942-NEXT: s_addc_u32 s3, s3, s7
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
@@ -198,30 +178,18 @@ entry:
}
define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_l:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_l:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: reassoc_scalar_l:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_u32 s2, s2, s6
+; GFX942-NEXT: s_addc_u32 s3, s3, s7
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
@@ -233,24 +201,14 @@ entry:
; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
-; GFX942_PTRADD-LABEL: shl_neg_offset:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: shl_neg_offset:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: shl_neg_offset:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%offset = sub i64 0, %noffset
%x = shl i64 %offset, %shift
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
@@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
-; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4
-; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12
+; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
+; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: complextype_global_gep:
@@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
; Tests the tryFoldToMad64_32 PTRADD combine.
define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
-; GFX942_PTRADD-LABEL: fold_mad64:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0
-; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: fold_mad64:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
-; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: fold_mad64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; GFX942-NEXT: global_store_dword v[0:1], v2, off
+; GFX942-NEXT: s_endpgm
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0
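The huge_offset_low_32_unused test above exercises foldAddSub64WithZeroLowBitsTo32: when the low 32 bits of the 64-bit offset are zero, only the high half needs an add, which is why the PTRADD output now matches the legacy single v_add_u32. A standalone arithmetic sketch of that condition (the in-tree code uses llvm::Lo_32/llvm::Hi_32 from llvm/Support/MathExtras.h):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Val = 0x100000000ULL;                  // GEP offset from the test
  uint32_t Lo = static_cast<uint32_t>(Val);       // Lo_32(Val)
  uint32_t Hi = static_cast<uint32_t>(Val >> 32); // Hi_32(Val)
  assert(Lo == 0); // nothing to add in the low half, so it is skipped
  assert(Hi == 1); // only "v_add_u32_e32 v1, 1, v1" remains
  return 0;
}
```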