[SeparateConstOffsetFromGEP] propagate const offset through GEP chains #143470

Open. Wants to merge 3 commits into base: main.
48 changes: 37 additions & 11 deletions llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1048,19 +1048,31 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   if (GEP->getType()->isVectorTy())
     return false;
 
+  // If the base of this GEP is a ptradd of a constant, lets pass the constant
+  // along. This ensures that when we have a chain of GEPs the constant
+  // offset from each is accumulated.
+  Value *NewBase;
+  const APInt *BaseOffset;
+  const bool ExtractBase =
+      match(GEP->getPointerOperand(),
+            m_PtrAdd(m_Value(NewBase), m_APInt(BaseOffset)));
+
+  const int64_t BaseByteOffset = ExtractBase ? BaseOffset->getSExtValue() : 0;
+
   // The backend can already nicely handle the case where all indices are
   // constant.
-  if (GEP->hasAllConstantIndices())
+  if (GEP->hasAllConstantIndices() && !ExtractBase)
     return false;
 
   bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);
 
   bool NeedsExtraction;
-  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+  int64_t AccumulativeByteOffset =
+      BaseByteOffset + accumulateByteOffset(GEP, NeedsExtraction);
 
   TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
 
-  if (!NeedsExtraction) {
+  if (!NeedsExtraction && !ExtractBase) {
     Changed |= reorderGEP(GEP, TTI);
     return Changed;
   }
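To make the new ExtractBase path concrete, here is a minimal, hypothetical LLVM IR sketch (the function and value names are made up and not taken from this PR or its tests): the outer GEP's pointer operand is a constant-offset ptradd, so its constant is folded into the accumulated byte offset instead of staying on an intermediate pointer.

; Hypothetical input: %mid is a constant-only i8 GEP (a ptradd), and %p
; indexes off it with a variable index.
define ptr @chain(ptr %base, i64 %i) {
  %mid = getelementptr inbounds i8, ptr %base, i64 16
  %p = getelementptr inbounds i32, ptr %mid, i64 %i
  ret ptr %p
}

; Rough shape after splitting (illustrative, not exact pass output): the
; variadic part is computed directly off %base and a single trailing
; constant GEP carries the accumulated 16 bytes, with %mid deleted once it
; becomes dead.
;   %v = getelementptr i32, ptr %base, i64 %i
;   %p = getelementptr i8, ptr %v, i64 16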
@@ -1084,7 +1096,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 
   // Track information for preserving GEP flags.
   bool AllOffsetsNonNegative = AccumulativeByteOffset >= 0;
-  bool AllNUWPreserved = true;
+  bool AllNUWPreserved = GEP->hasNoUnsignedWrap();
+  bool NewGEPInBounds = GEP->isInBounds();
+  bool NewGEPNUSW = GEP->hasNoUnsignedSignedWrap();
 
   // Remove the constant offset in each sequential index. The resultant GEP
   // computes the variadic base.
@@ -1120,6 +1134,16 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
       }
     }
   }
+  if (ExtractBase) {
+    GEPOperator *Base = cast<GEPOperator>(GEP->getPointerOperand());
+    AllNUWPreserved &= Base->hasNoUnsignedWrap();
+    NewGEPInBounds &= Base->isInBounds();
+    NewGEPNUSW &= Base->hasNoUnsignedSignedWrap();
+    AllOffsetsNonNegative &= BaseByteOffset >= 0;
+
+    GEP->setOperand(0, NewBase);
+    RecursivelyDeleteTriviallyDeadInstructions(Base);
+  }
 
   // Clear the inbounds attribute because the new index may be off-bound.
   // e.g.,
@@ -1147,21 +1171,21 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 
   // If the initial GEP was NUW and all operations that we reassociate were NUW
   // additions, the resulting GEPs are also NUW.
-  if (GEP->hasNoUnsignedWrap() && AllNUWPreserved) {
+  if (AllNUWPreserved) {
     NewGEPFlags |= GEPNoWrapFlags::noUnsignedWrap();
     // If the initial GEP additionally had NUSW (or inbounds, which implies
     // NUSW), we know that the indices in the initial GEP must all have their
     // signbit not set. For indices that are the result of NUW adds, the
     // add-operands therefore also don't have their signbit set. Therefore, all
     // indices of the resulting GEPs are non-negative -> we can preserve
     // the inbounds/nusw flag.
-    CanPreserveInBoundsNUSW |= GEP->hasNoUnsignedSignedWrap();
+    CanPreserveInBoundsNUSW |= NewGEPNUSW;
   }
 
   if (CanPreserveInBoundsNUSW) {
-    if (GEP->isInBounds())
+    if (NewGEPInBounds)
       NewGEPFlags |= GEPNoWrapFlags::inBounds();
-    else if (GEP->hasNoUnsignedSignedWrap())
+    else if (NewGEPNUSW)
       NewGEPFlags |= GEPNoWrapFlags::noUnsignedSignedWrap();
   }
 
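A hedged illustration of why the wrap flags are now intersected with the base GEP's flags rather than read off the outer GEP alone (a made-up example; the function and value names below are assumptions): once the base's constant offset is folded into the outer GEP's byte offset, the rebased result may only keep nuw/inbounds/nusw if both the outer GEP and the folded base carried them.

; Hypothetical: the outer GEP carries inbounds and nuw, but the base ptradd
; carries neither. After folding the base's 8-byte offset into the outer
; offset, the result can no longer be assumed nuw or inbounds, which is what
; the AllNUWPreserved / NewGEPInBounds / NewGEPNUSW intersection enforces.
define ptr @flags(ptr %base, i64 %i) {
  %mid = getelementptr i8, ptr %base, i64 8
  %p = getelementptr inbounds nuw i32, ptr %mid, i64 %i
  ret ptr %p
}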
@@ -1242,11 +1266,13 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
-  for (BasicBlock &B : F) {
-    if (!DT->isReachableFromEntry(&B))
+
+  ReversePostOrderTraversal<Function *> RPOT(&F);

Contributor:

How is this related to the rest of the patch?

Member Author:

The intent is to ensure that when there is a chain of GEPs, we always separate constant offsets in order. That way, the logic that picks up the constant offset from the first GEP and passes it along through the second always works as expected. My understanding is that a standard iteration doesn't give strong guarantees about the order in which blocks are visited, whereas an RPOT ensures GEPs are transformed in order.

+  for (BasicBlock *B : RPOT) {
+    if (!DT->isReachableFromEntry(B))
       continue;
 
-    for (Instruction &I : llvm::make_early_inc_range(B))
+    for (Instruction &I : llvm::make_early_inc_range(*B))
       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
         Changed |= splitGEP(GEP);
     // No need to split GEP ConstantExprs because all its indices are constant
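To illustrate the ordering concern from the review thread above, here is a hypothetical two-block chain (names and shapes are assumptions, not taken from this PR's tests): because a block's dominators precede it in a reverse post-order traversal, the defining GEP is split before its user, so the user's pointer operand is already in the ptradd form that the ExtractBase logic matches.

; Hypothetical chain split across blocks.
define ptr @cross_block(ptr %base, i64 %i, i64 %j) {
entry:
  ; Splitting this GEP first leaves its constant part as a trailing i8 GEP
  ; (a ptradd) feeding %g1's users.
  %g1 = getelementptr inbounds [4 x i32], ptr %base, i64 %i, i64 2
  br label %use

use:
  ; Visited after entry under RPO, so by now the pointer operand of %g2 is
  ; that ptradd and its constant keeps accumulating into %g2's offset.
  %g2 = getelementptr inbounds i32, ptr %g1, i64 %j
  ret ptr %g2
}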
@@ -279,11 +279,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT: ds_write_b32 v0, v58
 ; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_16: ; %Flow45
+; CHECK-NEXT: .LBB0_16: ; %Flow43
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69
 ; CHECK-NEXT: v_mov_b32_e32 v57, v0
-; CHECK-NEXT: .LBB0_17: ; %Flow46
+; CHECK-NEXT: .LBB0_17: ; %Flow44
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
 ; CHECK-NEXT: s_mov_b32 s55, exec_lo
@@ -330,11 +330,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT: ds_write_b32 v0, v57
 ; CHECK-NEXT: s_branch .LBB0_19
-; CHECK-NEXT: .LBB0_22: ; %Flow43
+; CHECK-NEXT: .LBB0_22: ; %Flow41
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_inst_prefetch 0x2
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
-; CHECK-NEXT: .LBB0_23: ; %Flow44
+; CHECK-NEXT: .LBB0_23: ; %Flow42
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
@@ -347,7 +347,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: s_or_b32 s53, s4, s53
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5
-; CHECK-NEXT: .LBB0_25: ; %Flow51
+; CHECK-NEXT: .LBB0_25: ; %Flow49
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
 ; CHECK-NEXT: v_mov_b32_e32 v0, 1