[SeparateConstOffsetFromGEP] Reland: Reorder trivial GEP chains to separate constants #81671

Merged (1 commit, Feb 14, 2024)
73 changes: 70 additions & 3 deletions llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -391,6 +391,11 @@ class SeparateConstOffsetFromGEP {
/// and returns true if the splitting succeeds.
bool splitGEP(GetElementPtrInst *GEP);

/// Tries to reorder the given GEP with the GEP that produces the base if
/// doing so results in producing a constant offset as the outermost
/// index.
bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);

/// Lower a GEP with multiple indices into multiple GEPs with a single index.
/// Function splitGEP already split the original GEP into a variadic part and
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -964,6 +969,66 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
Variadic->eraseFromParent();
}

bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
TargetTransformInfo &TTI) {
Type *GEPType = GEP->getResultElementType();
// TODO: support reordering for non-trivial GEP chains
if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
return false;

auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
if (!PtrGEP)
return false;
Type *PtrGEPType = PtrGEP->getResultElementType();
// TODO: support reordering for non-trivial GEP chains
if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
return false;

// TODO: support reordering for non-trivial GEP chains
if (PtrGEPType != GEPType ||
PtrGEP->getSourceElementType() != GEP->getSourceElementType())
return false;

bool NestedNeedsExtraction;
int64_t NestedByteOffset =
accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
if (!NestedNeedsExtraction)
return false;

unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
/*BaseGV=*/nullptr, NestedByteOffset,
/*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
return false;

IRBuilder<> Builder(GEP);
Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
bool GEPInBounds = GEP->isInBounds();
bool PtrGEPInBounds = PtrGEP->isInBounds();
bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
if (IsChainInBounds) {
auto GEPIdx = GEP->indices().begin();
auto KnownGEPIdx = computeKnownBits(GEPIdx->get(), *DL);
IsChainInBounds &= KnownGEPIdx.isNonNegative();
if (IsChainInBounds) {
auto PtrGEPIdx = GEP->indices().begin();
auto KnownPtrGEPIdx = computeKnownBits(PtrGEPIdx->get(), *DL);
IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
}
}

  // For trivial GEP chains, we can swap the indices.
auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
SmallVector<Value *, 4>(GEP->indices()));
cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
SmallVector<Value *, 4>(PtrGEP->indices()));
cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
GEP->replaceAllUsesWith(NewGEP);
RecursivelyDeleteTriviallyDeadInstructions(GEP);
return true;
}

bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// Skip vector GEPs.
if (GEP->getType()->isVectorTy())
@@ -979,11 +1044,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
bool NeedsExtraction;
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

if (!NeedsExtraction)
return Changed;

TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());

if (!NeedsExtraction) {
Changed |= reorderGEP(GEP, TTI);
return Changed;
}

// If LowerGEP is disabled, before really splitting the GEP, check whether the
// backend supports the addressing mode we are about to produce. If no, this
// splitting probably won't be beneficial.
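A minimal, hand-written IR sketch of the reordering this patch adds (not taken from the patch's own tests; the value names %base, %i and the float element type are assumptions for illustration). reorderGEP looks for a trivial two-GEP chain whose inner GEP carries a constant index and whose outer GEP carries a variable index over the same element type, and swaps them so the constant offset ends up on the outermost GEP, assuming the target reports a base register plus that byte offset as a legal addressing mode:

  ; Before: the constant index sits on the inner GEP, underneath the
  ; variable index, so it cannot be folded into the load's addressing mode.
  %p = getelementptr float, ptr %base, i64 2   ; constant index (8-byte offset)
  %q = getelementptr float, ptr %p, i64 %i     ; variable index
  %v = load float, ptr %q

  ; After reorderGEP: the variable index is applied to the base first and the
  ; constant index becomes the outermost GEP, where the backend can fold it
  ; into an immediate offset.
  %p2 = getelementptr float, ptr %base, i64 %i
  %q2 = getelementptr float, ptr %p2, i64 2
  %v2 = load float, ptr %q2

The rewritten chain only keeps the inbounds flag when both original GEPs were inbounds and the examined indices are known non-negative; otherwise the new GEPs drop it.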
@@ -273,11 +273,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_16: ; %Flow43
; CHECK-NEXT: .LBB0_16: ; %Flow45
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: v_mov_b32_e32 v57, v0
; CHECK-NEXT: .LBB0_17: ; %Flow44
; CHECK-NEXT: .LBB0_17: ; %Flow46
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: s_mov_b32 s49, exec_lo
@@ -323,11 +323,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: s_branch .LBB0_19
; CHECK-NEXT: .LBB0_22: ; %Flow41
; CHECK-NEXT: .LBB0_22: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: .LBB0_23: ; %Flow42
; CHECK-NEXT: .LBB0_23: ; %Flow44
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
@@ -340,7 +340,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_or_b32 s43, s4, s43
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s43
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
; CHECK-NEXT: .LBB0_25: ; %Flow49
; CHECK-NEXT: .LBB0_25: ; %Flow51
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
@@ -362,12 +362,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41
; CHECK-NEXT: s_cbranch_execz .LBB0_33
; CHECK-NEXT: ; %bb.26:
; CHECK-NEXT: s_add_u32 s42, s44, 8
; CHECK-NEXT: s_addc_u32 s43, s45, 0
; CHECK-NEXT: s_mov_b32 s44, 0
; CHECK-NEXT: s_mov_b32 s42, 0
; CHECK-NEXT: s_branch .LBB0_28
; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -383,12 +381,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42
; CHECK-NEXT: s_cbranch_execz .LBB0_33
; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
; CHECK-NEXT: s_mov_b32 s45, exec_lo
; CHECK-NEXT: s_mov_b32 s43, exec_lo
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
@@ -397,15 +395,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
; CHECK-NEXT: v_add_co_u32 v2, s4, s42, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s43, 0, s4
; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
@@ -417,8 +415,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_cbranch_execz .LBB0_27
; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
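The updated CHECK lines above show the payoff on the AMDGPU test: the scalar pair that pre-added 8 to the base pointer (s_add_u32 / s_addc_u32) disappears, and the constant instead travels as an immediate on the loads (off offset:8 and off offset:24 rather than off and off offset:16). A minimal IR sketch of the shape behind this, with hypothetical names standing in for the kernel's actual values:

  ; After reordering, the 8-byte constant is the outermost index, so the
  ; backend can select it as the load's immediate offset instead of emitting
  ; a separate scalar add on the base pointer.
  %row = getelementptr i8, ptr addrspace(1) %base, i64 %rowoff   ; variable part
  %elt = getelementptr i8, ptr addrspace(1) %row, i64 8          ; constant part
  %vec = load <4 x i32>, ptr addrspace(1) %elt, align 16
  ; expected selection, as in the CHECK lines: global_load_dwordx4 ..., off offset:8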