Commit 4adc54a
[AMDGPU][SDAG] Only fold flat offsets if they are inbounds
For flat memory instructions where the address is supplied as a base address register with an immediate offset, the memory aperture test ignores the immediate offset. Currently, ISel does not respect that, which leads to miscompilations where valid input programs crash when the address computation relies on the immediate offset to get the base address into the proper memory aperture. Global or scratch instructions are not affected.

This patch only selects flat instructions with immediate offsets from address computations that carry the inbounds flag: if the address computation does not leave the bounds of the allocated object, it cannot leave the bounds of the memory aperture and is therefore safe to handle with an immediate offset.

It also adds the inbounds flag to DAG nodes resulting from transformations:

- Address computations resulting from getObjectPtrOffset. As far as I can tell, this function is only used to compute addresses within accessed memory ranges, e.g., for loads and stores that are split during legalization.
- Reassociated inbounds adds. If both involved operations are inbounds, then so are the operations after the transformation.
- Address computations in the SelectionDAG lowering of the memcpy/move/set intrinsics. Base and result of the address arithmetic there are accessed, so the operation must be inbounds.

It might make sense to separate these changes into their own PR, but I don't see a way to test them without adding a use of the inbounds SDAG flag.

Affected tests:

- CodeGen/AMDGPU/fold-gep-offset.ll: Offsets are no longer wrongly folded; new positive tests were added where we still do fold them.
- Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll: Offset folding doesn't seem integral to this test, so the test is not changed to make offset folding still happen.
- CodeGen/AMDGPU/loop-prefetch-data.ll: loop-reduce prefers to base the addresses of memory accesses on the potentially out-of-bounds addresses used for prefetching; that might be a separate issue to look into.
- CodeGen/AMDGPU/memintrinsic-unroll.ll: memset tests were added to make sure that offsets in the memset DAG lowering are still folded properly.

A similar patch for GlobalISel will follow.

Fixes SWDEV-516125.
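As an illustration only (not part of this patch, helper names hypothetical), the distinction the patch draws starts at the IR level: an address built with an inbounds GEP can, assuming the GEP's inbounds flag is propagated to the resulting ISD::ADD as this patch relies on, have its constant offset folded into the flat instruction's immediate field, while a plain GEP cannot.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helpers, for illustration of the two cases.
Value *makeFoldableFlatAddr(IRBuilder<> &B, Value *FlatPtr, uint64_t Off) {
  // inbounds: the offset stays inside the allocated object, so it cannot move
  // the address into a different memory aperture; ISel may fold Off into the
  // flat instruction's immediate offset field.
  Value *Idx = B.getInt64(Off);
  return B.CreateInBoundsGEP(B.getInt8Ty(), FlatPtr, Idx);
}

Value *makePlainFlatAddr(IRBuilder<> &B, Value *FlatPtr, uint64_t Off) {
  // No inbounds: the addition may cross an aperture boundary, so with this
  // patch the offset is materialized as a separate VALU add instead.
  Value *Idx = B.getInt64(Off);
  return B.CreateGEP(B.getInt8Ty(), FlatPtr, Idx);
}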
1 parent: cb87bed

File tree: 8 files changed, +717 −94 lines

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 8 additions & 4 deletions
@@ -1069,23 +1069,27 @@ class SelectionDAG {
                            SDValue EVL);
 
   /// Returns sum of the base pointer and offset.
-  /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default.
+  /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by
+  /// default.
   SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL,
                                const SDNodeFlags Flags = SDNodeFlags());
   SDValue getMemBasePlusOffset(SDValue Base, SDValue Offset, const SDLoc &DL,
                                const SDNodeFlags Flags = SDNodeFlags());
 
   /// Create an add instruction with appropriate flags when used for
   /// addressing some offset of an object. i.e. if a load is split into multiple
-  /// components, create an add nuw from the base pointer to the offset.
+  /// components, create an add nuw inbounds from the base pointer to the
+  /// offset.
   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) {
-    return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+    return getMemBasePlusOffset(
+        Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
   }
 
   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) {
     // The object itself can't wrap around the address space, so it shouldn't be
     // possible for the adds of the offsets to the split parts to overflow.
-    return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+    return getMemBasePlusOffset(
+        Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
   }
 
   /// Return a new CALLSEQ_START node, that starts new call frame, in which
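A minimal sketch (not from the patch, helper name hypothetical) of how a legalization-style caller observes the new default: the node returned by getObjectPtrOffset now carries both NoUnsignedWrap and InBounds, while getMemBasePlusOffset still applies only the flags its caller passes explicitly.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: form the address of the upper half of a split 16-byte
// access. The offset stays within the accessed object, so getObjectPtrOffset
// is appropriate and (with this patch) the resulting ISD::ADD is marked
// nuw + inbounds, assuming it is not constant-folded away.
static SDValue getUpperHalfAddr(SelectionDAG &DAG, const SDLoc &DL,
                                SDValue BasePtr) {
  return DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(8));
}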

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 6 additions & 3 deletions
@@ -1213,9 +1213,12 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
 
     if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) {
       SDNodeFlags NewFlags;
-      if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
-          Flags.hasNoUnsignedWrap())
-        NewFlags |= SDNodeFlags::NoUnsignedWrap;
+      if (N0.getOpcode() == ISD::ADD) {
+        if (N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap())
+          NewFlags |= SDNodeFlags::NoUnsignedWrap;
+        if (N0->getFlags().hasInBounds() && Flags.hasInBounds())
+          NewFlags |= SDNodeFlags::InBounds;
+      }
 
       if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
         // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
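The guarded flag propagation above can be read as an intersection: the reassociated add may only claim a flag if both original adds had it. A standalone sketch of that rule (hypothetical helper, not part of the patch):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Keep only the pointer-arithmetic flags that both adds agree on; claiming
// nuw or inbounds on the combined add would otherwise be unsound.
static SDNodeFlags intersectAddFlags(SDNodeFlags Outer, SDNodeFlags Inner) {
  SDNodeFlags Result;
  if (Outer.hasNoUnsignedWrap() && Inner.hasNoUnsignedWrap())
    Result |= SDNodeFlags::NoUnsignedWrap;
  if (Outer.hasInBounds() && Inner.hasInBounds())
    Result |= SDNodeFlags::InBounds;
  return Result;
}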

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 6 additions & 6 deletions
@@ -8279,7 +8279,7 @@ static SDValue getMemcpyLoadsAndStores(
       if (Value.getNode()) {
         Store = DAG.getStore(
             Chain, dl, Value,
-            DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+            DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
             DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
         OutChains.push_back(Store);
       }
@@ -8304,14 +8304,14 @@ static SDValue getMemcpyLoadsAndStores(
 
       Value = DAG.getExtLoad(
           ISD::EXTLOAD, dl, NVT, Chain,
-          DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+          DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
           SrcPtrInfo.getWithOffset(SrcOff), VT,
           commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo);
       OutLoadChains.push_back(Value.getValue(1));
 
       Store = DAG.getTruncStore(
           Chain, dl, Value,
-          DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+          DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
           DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo);
       OutStoreChains.push_back(Store);
     }
@@ -8448,7 +8448,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
 
     Value = DAG.getLoad(
         VT, dl, Chain,
-        DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
         SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo);
     LoadValues.push_back(Value);
     LoadChains.push_back(Value.getValue(1));
@@ -8463,7 +8463,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
 
     Store = DAG.getStore(
         Chain, dl, LoadValues[i],
-        DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+        DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
         DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
     OutChains.push_back(Store);
     DstOff += VTSize;
@@ -8595,7 +8595,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
     assert(Value.getValueType() == VT && "Value with wrong type.");
     SDValue Store = DAG.getStore(
         Chain, dl, Value,
-        DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+        DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
         DstPtrInfo.getWithOffset(DstOff), Alignment,
         isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone,
         NewAAInfo);
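A hypothetical sketch (not the actual lowering) of why these expansions can use getObjectPtrOffset: every address formed here is the destination plus an offset that lies strictly inside the copied or filled region, so each add stays within an accessed object and may be marked nuw + inbounds.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: collect the store addresses for a region of Size bytes
// written in chunks of VTSize bytes. Every DstOff is inside the region, so
// building the adds with getObjectPtrOffset is sound.
static void collectStoreAddrs(SelectionDAG &DAG, const SDLoc &dl, SDValue Dst,
                              uint64_t Size, uint64_t VTSize,
                              SmallVectorImpl<SDValue> &Addrs) {
  for (uint64_t DstOff = 0; DstOff + VTSize <= Size; DstOff += VTSize)
    Addrs.push_back(
        DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)));
}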

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 75 additions & 65 deletions
@@ -1744,72 +1744,82 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
        isFlatScratchBaseLegal(Addr))) {
     int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
-    const SIInstrInfo *TII = Subtarget->getInstrInfo();
-    if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
-      Addr = N0;
-      OffsetVal = COffsetVal;
-    } else {
-      // If the offset doesn't fit, put the low bits into the offset field and
-      // add the rest.
-      //
-      // For a FLAT instruction the hardware decides whether to access
-      // global/scratch/shared memory based on the high bits of vaddr,
-      // ignoring the offset field, so we have to ensure that when we add
-      // remainder to vaddr it still points into the same underlying object.
-      // The easiest way to do that is to make sure that we split the offset
-      // into two pieces that are both >= 0 or both <= 0.
-
-      SDLoc DL(N);
-      uint64_t RemainderOffset;
-
-      std::tie(OffsetVal, RemainderOffset) =
-          TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
-      SDValue AddOffsetLo =
-          getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
-      if (Addr.getValueType().getSizeInBits() == 32) {
-        SmallVector<SDValue, 3> Opnds;
-        Opnds.push_back(N0);
-        Opnds.push_back(AddOffsetLo);
-        unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
-        if (Subtarget->hasAddNoCarry()) {
-          AddOp = AMDGPU::V_ADD_U32_e64;
-          Opnds.push_back(Clamp);
-        }
-        Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+    // Adding the offset to the base address in a FLAT instruction must not
+    // change the memory aperture in which the address falls. Therefore we can
+    // only fold offsets from inbounds GEPs into FLAT instructions.
+    bool IsInBounds = Addr->getFlags().hasInBounds();
+    if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+      const SIInstrInfo *TII = Subtarget->getInstrInfo();
+      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+        Addr = N0;
+        OffsetVal = COffsetVal;
       } else {
-        // TODO: Should this try to use a scalar add pseudo if the base address
-        // is uniform and saddr is usable?
-        SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-        SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
-        SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                              DL, MVT::i32, N0, Sub0);
-        SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                              DL, MVT::i32, N0, Sub1);
-
-        SDValue AddOffsetHi =
-            getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
-        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
-        SDNode *Add =
-            CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
-                                   {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
-        SDNode *Addc = CurDAG->getMachineNode(
-            AMDGPU::V_ADDC_U32_e64, DL, VTs,
-            {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
-        SDValue RegSequenceArgs[] = {
-            CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-            SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
-        Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                              MVT::i64, RegSequenceArgs),
-                       0);
+        // If the offset doesn't fit, put the low bits into the offset field
+        // and add the rest.
+        //
+        // For a FLAT instruction the hardware decides whether to access
+        // global/scratch/shared memory based on the high bits of vaddr,
+        // ignoring the offset field, so we have to ensure that when we add
+        // remainder to vaddr it still points into the same underlying object.
+        // The easiest way to do that is to make sure that we split the offset
+        // into two pieces that are both >= 0 or both <= 0.
+
+        SDLoc DL(N);
+        uint64_t RemainderOffset;
+
+        std::tie(OffsetVal, RemainderOffset) =
+            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+        SDValue AddOffsetLo =
+            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+        if (Addr.getValueType().getSizeInBits() == 32) {
+          SmallVector<SDValue, 3> Opnds;
+          Opnds.push_back(N0);
+          Opnds.push_back(AddOffsetLo);
+          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+          if (Subtarget->hasAddNoCarry()) {
+            AddOp = AMDGPU::V_ADD_U32_e64;
+            Opnds.push_back(Clamp);
+          }
+          Addr =
+              SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+        } else {
+          // TODO: Should this try to use a scalar add pseudo if the base
+          // address is uniform and saddr is usable?
+          SDValue Sub0 =
+              CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+          SDValue Sub1 =
+              CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                DL, MVT::i32, N0, Sub0);
+          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                DL, MVT::i32, N0, Sub1);
+
+          SDValue AddOffsetHi =
+              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+          SDNode *Add =
+              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+          SDNode *Addc = CurDAG->getMachineNode(
+              AMDGPU::V_ADDC_U32_e64, DL, VTs,
+              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+          SDValue RegSequenceArgs[] = {
+              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+                                        MVT::i32),
+              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                MVT::i64, RegSequenceArgs),
+                         0);
+        }
       }
     }
   }
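For readers unfamiliar with the split mentioned in the comment above ("two pieces that are both >= 0 or both <= 0"), here is a simplified, hypothetical sketch for the non-negative case; the real logic lives in SIInstrInfo::splitFlatOffset and depends on the subtarget's legal immediate range, so the 13-bit field below is only an assumption for illustration.

#include <cassert>
#include <cstdint>
#include <utility>

// Simplified illustration only: keep the low bits in the immediate field and
// add the remainder to vaddr. For a non-negative offset both pieces are >= 0,
// so vaddr + remainder still points into the same underlying object and the
// hardware's aperture test sees the correct high bits.
static std::pair<int64_t, int64_t> splitNonNegativeOffset(int64_t Offset,
                                                          unsigned ImmBits) {
  assert(Offset >= 0 && "sketch only handles the non-negative case");
  int64_t ImmMask = (int64_t(1) << (ImmBits - 1)) - 1; // positive immediates
  int64_t Imm = Offset & ImmMask;
  int64_t Remainder = Offset - Imm;
  return {Imm, Remainder}; // e.g. 0x12345, ImmBits=13 -> {0x345, 0x12000}
}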
