Skip to content

Commit 9683c1e

Browse files
committed
[SepGEP] Reorder trivial GEP chains to separate constants
Change-Id: I23f9652b4d22a467725ad2b65df338046b1e5522
1 parent 237adfc commit 9683c1e

File tree

8 files changed

+684
-159
lines changed

8 files changed

+684
-159
lines changed

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,11 @@ class SeparateConstOffsetFromGEP {
393393
/// and returns true if the splitting succeeds.
394394
bool splitGEP(GetElementPtrInst *GEP);
395395

396+
/// Tries to reorder the given GEP with the GEP that produces the base if
397+
/// doing so results in producing a constant offset as the outermost
398+
/// index.
399+
bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);
400+
396401
/// Lower a GEP with multiple indices into multiple GEPs with a single index.
397402
/// Function splitGEP already split the original GEP into a variadic part and
398403
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -972,6 +977,66 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
972977
Variadic->eraseFromParent();
973978
}
974979

980+
/// Tries to swap \p GEP with the GEP that produces its pointer operand so
/// that the constant-offset GEP becomes the outermost one in the chain.
///
/// Only "trivial" chains are handled: both GEPs must have exactly one index,
/// a non-aggregate result element type, and identical source/result element
/// types. The base GEP must carry a constant offset that needs extraction, and
/// the target must report `base + offset` as a legal addressing mode for the
/// accessed type.
///
/// \param GEP The outer GEP whose pointer operand is inspected.
/// \param TTI Used to query whether the resulting constant offset is a legal
///            addressing mode in the base GEP's address space.
/// \returns true if the chain was rewritten (and \p GEP replaced), false if
///          nothing was changed.
bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) {
  Type *GEPType = GEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
    return false;

  auto *PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
  if (!PtrGEP)
    return false;
  Type *PtrGEPType = PtrGEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
    return false;

  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType != GEPType ||
      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
    return false;

  // Reordering is only worthwhile when the base GEP is the one carrying a
  // constant offset to extract.
  bool NestedNeedsExtraction;
  int64_t NestedByteOffset =
      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
  if (!NestedNeedsExtraction)
    return false;

  // Bail out unless the target can fold the constant offset into the
  // addressing mode (base register + immediate, no scaled index).
  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
  if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                 /*BaseGV=*/nullptr, NestedByteOffset,
                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
    return false;

  IRBuilder<> Builder(GEP);
  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());

  // The rewritten chain keeps `inbounds` only if both original GEPs were
  // inbounds and both of their indices are known to be non-negative.
  bool IsChainInBounds = GEP->isInBounds() && PtrGEP->isInBounds();
  if (IsChainInBounds) {
    auto GEPIdx = GEP->indices().begin();
    auto KnownGEPIdx = computeKnownBits(GEPIdx->get(), *DL);
    IsChainInBounds &= KnownGEPIdx.isNonNegative();
    if (IsChainInBounds) {
      // Check the index of the *base* GEP here; the outer GEP's index was
      // already checked above.
      auto PtrGEPIdx = PtrGEP->indices().begin();
      auto KnownPtrGEPIdx = computeKnownBits(PtrGEPIdx->get(), *DL);
      IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
    }
  }

  // For trivial GEP chains, we can swap the indices.
  Value *NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
                                    SmallVector<Value *, 4>(GEP->indices()));
  // IRBuilder may fold the GEP to a constant, in which case there is no
  // instruction to put the inbounds flag on.
  if (auto *NewSrcGEP = dyn_cast<GetElementPtrInst>(NewSrc))
    NewSrcGEP->setIsInBounds(IsChainInBounds);

  Value *NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                    SmallVector<Value *, 4>(PtrGEP->indices()));
  if (auto *NewGEPInst = dyn_cast<GetElementPtrInst>(NewGEP))
    NewGEPInst->setIsInBounds(IsChainInBounds);

  GEP->replaceAllUsesWith(NewGEP);
  // Removes the now-unused outer GEP and, if it becomes dead, the base GEP.
  RecursivelyDeleteTriviallyDeadInstructions(GEP);
  return true;
}
1039+
9751040
bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9761041
// Skip vector GEPs.
9771042
if (GEP->getType()->isVectorTy())
@@ -987,11 +1052,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9871052
bool NeedsExtraction;
9881053
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
9891054

990-
if (!NeedsExtraction)
991-
return Changed;
992-
9931055
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
9941056

1057+
if (!NeedsExtraction) {
1058+
Changed |= reorderGEP(GEP, TTI);
1059+
return Changed;
1060+
}
1061+
9951062
// If LowerGEP is disabled, before really splitting the GEP, check whether the
9961063
// backend supports the addressing mode we are about to produce. If no, this
9971064
// splitting probably won't be beneficial.

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -264,11 +264,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
264264
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
265265
; CHECK-NEXT: ds_write_b32 v0, v58
266266
; CHECK-NEXT: s_branch .LBB0_7
267-
; CHECK-NEXT: .LBB0_16: ; %Flow43
267+
; CHECK-NEXT: .LBB0_16: ; %Flow45
268268
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
269269
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
270270
; CHECK-NEXT: v_mov_b32_e32 v57, v0
271-
; CHECK-NEXT: .LBB0_17: ; %Flow44
271+
; CHECK-NEXT: .LBB0_17: ; %Flow46
272272
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
273273
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
274274
; CHECK-NEXT: s_mov_b32 s55, exec_lo
@@ -311,11 +311,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
311311
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
312312
; CHECK-NEXT: ds_write_b32 v0, v57
313313
; CHECK-NEXT: s_branch .LBB0_19
314-
; CHECK-NEXT: .LBB0_22: ; %Flow41
314+
; CHECK-NEXT: .LBB0_22: ; %Flow43
315315
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
316316
; CHECK-NEXT: s_inst_prefetch 0x2
317317
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
318-
; CHECK-NEXT: .LBB0_23: ; %Flow42
318+
; CHECK-NEXT: .LBB0_23: ; %Flow44
319319
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
320320
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
321321
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
@@ -328,7 +328,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
328328
; CHECK-NEXT: s_or_b32 s49, s4, s49
329329
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49
330330
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
331-
; CHECK-NEXT: .LBB0_25: ; %Flow49
331+
; CHECK-NEXT: .LBB0_25: ; %Flow51
332332
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
333333
; CHECK-NEXT: v_mov_b32_e32 v31, v41
334334
; CHECK-NEXT: v_mov_b32_e32 v0, 1
@@ -347,18 +347,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
347347
; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v40
348348
; CHECK-NEXT: s_cbranch_execz .LBB0_33
349349
; CHECK-NEXT: ; %bb.26:
350-
; CHECK-NEXT: s_add_u32 s52, s44, 8
351-
; CHECK-NEXT: s_addc_u32 s53, s45, 0
352350
; CHECK-NEXT: s_getpc_b64 s[42:43]
353351
; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_addPU3AS1Vjj@rel32@lo+4
354352
; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_addPU3AS1Vjj@rel32@hi+12
355353
; CHECK-NEXT: s_mov_b32 s54, 0
356-
; CHECK-NEXT: s_getpc_b64 s[44:45]
357-
; CHECK-NEXT: s_add_u32 s44, s44, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
358-
; CHECK-NEXT: s_addc_u32 s45, s45, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
359354
; CHECK-NEXT: s_getpc_b64 s[48:49]
360-
; CHECK-NEXT: s_add_u32 s48, s48, _Z14get_local_sizej@rel32@lo+4
361-
; CHECK-NEXT: s_addc_u32 s49, s49, _Z14get_local_sizej@rel32@hi+12
355+
; CHECK-NEXT: s_add_u32 s48, s48, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
356+
; CHECK-NEXT: s_addc_u32 s49, s49, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
357+
; CHECK-NEXT: s_getpc_b64 s[52:53]
358+
; CHECK-NEXT: s_add_u32 s52, s52, _Z14get_local_sizej@rel32@lo+4
359+
; CHECK-NEXT: s_addc_u32 s53, s53, _Z14get_local_sizej@rel32@hi+12
362360
; CHECK-NEXT: s_branch .LBB0_28
363361
; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
364362
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
@@ -371,7 +369,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
371369
; CHECK-NEXT: s_mov_b32 s12, s41
372370
; CHECK-NEXT: s_mov_b32 s13, s40
373371
; CHECK-NEXT: s_mov_b32 s14, s33
374-
; CHECK-NEXT: s_swappc_b64 s[30:31], s[48:49]
372+
; CHECK-NEXT: s_swappc_b64 s[30:31], s[52:53]
375373
; CHECK-NEXT: v_add_co_u32 v40, vcc_lo, v0, v40
376374
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v40
377375
; CHECK-NEXT: s_or_b32 s54, vcc_lo, s54
@@ -388,15 +386,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
388386
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
389387
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
390388
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
391-
; CHECK-NEXT: v_add_co_u32 v2, s4, s52, v1
392-
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s53, 0, s4
389+
; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1
390+
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4
393391
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
394392
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
395393
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
396394
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
397395
; CHECK-NEXT: s_clause 0x1
398-
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
399-
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
396+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
397+
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
400398
; CHECK-NEXT: s_waitcnt vmcnt(0)
401399
; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
402400
; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
@@ -408,8 +406,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
408406
; CHECK-NEXT: s_cbranch_execz .LBB0_27
409407
; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
410408
; CHECK-NEXT: s_clause 0x1
411-
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
412-
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
409+
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
410+
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
413411
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
414412
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
415413
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
@@ -484,7 +482,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
484482
; CHECK-NEXT: s_mov_b32 s12, s41
485483
; CHECK-NEXT: s_mov_b32 s13, s40
486484
; CHECK-NEXT: s_mov_b32 s14, s33
487-
; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
485+
; CHECK-NEXT: s_swappc_b64 s[30:31], s[48:49]
488486
; CHECK-NEXT: s_branch .LBB0_27
489487
; CHECK-NEXT: .LBB0_33:
490488
; CHECK-NEXT: s_endpgm

0 commit comments

Comments
 (0)