Skip to content

Commit 190110d

Browse files
jrbyrnes authored
and bcahoon committed
[SeparateConstOffsetFromGEP] Reland: Reorder trivial GEP chains to separate constants (llvm#81671)
Actually update tests w.r.t llvm@9e5a77f and reland llvm#73056 Change-Id: Ic9bbad513c6056b8c56a856015d808e01be81004
1 parent f2f33aa commit 190110d

File tree

8 files changed

+683
-158
lines changed

8 files changed

+683
-158
lines changed

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,11 @@ class SeparateConstOffsetFromGEP {
393393
/// and returns true if the splitting succeeds.
394394
bool splitGEP(GetElementPtrInst *GEP);
395395

396+
/// Tries to reorder the given GEP with the GEP that produces the base if
397+
/// doing so results in producing a constant offset as the outermost
398+
/// index.
399+
bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);
400+
396401
/// Lower a GEP with multiple indices into multiple GEPs with a single index.
397402
/// Function splitGEP already split the original GEP into a variadic part and
398403
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -970,6 +975,66 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
970975
Variadic->eraseFromParent();
971976
}
972977

978+
// Tries to swap GEP with the GEP producing its base pointer when both are
// trivial (single-index, non-aggregate, same element types) so that the
// constant offset becomes the outermost index, enabling it to be folded
// into the addressing mode.
//
// \param GEP the outer GEP whose base may be another GEP.
// \param TTI used to verify the target supports the resulting base+offset
//        addressing mode.
// \returns true if the chain was reordered (GEP is RAUW'd and erased).
bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) {
  Type *GEPType = GEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
    return false;

  auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
  if (!PtrGEP)
    return false;
  Type *PtrGEPType = PtrGEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
    return false;

  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType != GEPType ||
      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
    return false;

  // Only reorder when the inner GEP actually carries a constant byte offset;
  // otherwise the swap gains nothing.
  bool NestedNeedsExtraction;
  int64_t NestedByteOffset =
      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
  if (!NestedNeedsExtraction)
    return false;

  // Check the target can fold the extracted constant into the address.
  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
  if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                 /*BaseGV=*/nullptr, NestedByteOffset,
                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
    return false;

  IRBuilder<> Builder(GEP);
  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
  bool GEPInBounds = GEP->isInBounds();
  bool PtrGEPInBounds = PtrGEP->isInBounds();
  // The swapped chain may only stay inbounds when both original GEPs are
  // inbounds AND both indices are provably non-negative (a negative index
  // could make an intermediate pointer stray outside the object).
  bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
  if (IsChainInBounds) {
    auto GEPIdx = GEP->indices().begin();
    auto KnownGEPIdx = computeKnownBits(GEPIdx->get(), *DL);
    IsChainInBounds &= KnownGEPIdx.isNonNegative();
    if (IsChainInBounds) {
      // BUGFIX: inspect the *inner* GEP's index here. The original code
      // queried GEP->indices() a second time, so PtrGEP's index sign was
      // never actually checked before marking the chain inbounds.
      auto PtrGEPIdx = PtrGEP->indices().begin();
      auto KnownPtrGEPIdx = computeKnownBits(PtrGEPIdx->get(), *DL);
      IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
    }
  }

  // For trivial GEP chains, we can swap the indices.
  auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
                                  SmallVector<Value *, 4>(GEP->indices()));
  cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
  auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                  SmallVector<Value *, 4>(PtrGEP->indices()));
  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
  GEP->replaceAllUsesWith(NewGEP);
  RecursivelyDeleteTriviallyDeadInstructions(GEP);
  return true;
}
1037+
9731038
bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9741039
// Skip vector GEPs.
9751040
if (GEP->getType()->isVectorTy())
@@ -985,11 +1050,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9851050
bool NeedsExtraction;
9861051
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
9871052

988-
if (!NeedsExtraction)
989-
return Changed;
990-
9911053
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
9921054

1055+
if (!NeedsExtraction) {
1056+
Changed |= reorderGEP(GEP, TTI);
1057+
return Changed;
1058+
}
1059+
9931060
// If LowerGEP is disabled, before really splitting the GEP, check whether the
9941061
// backend supports the addressing mode we are about to produce. If no, this
9951062
// splitting probably won't be beneficial.

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
143143
; CHECK-NEXT: v_mov_b32_e32 v47, 0
144144
; CHECK-NEXT: s_mov_b32 s49, 0
145145
; CHECK-NEXT: s_branch .LBB0_7
146-
; CHECK-NEXT: .LBB0_5: ; %Flow41
146+
; CHECK-NEXT: .LBB0_5: ; %Flow43
147147
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
148148
; CHECK-NEXT: s_inst_prefetch 0x2
149149
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
150-
; CHECK-NEXT: .LBB0_6: ; %Flow42
150+
; CHECK-NEXT: .LBB0_6: ; %Flow44
151151
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
152152
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45
153153
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
@@ -304,7 +304,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
304304
; CHECK-NEXT: ds_write_b32 v0, v58
305305
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
306306
; CHECK-NEXT: s_branch .LBB0_9
307-
; CHECK-NEXT: .LBB0_18: ; %Flow43
307+
; CHECK-NEXT: .LBB0_18: ; %Flow45
308308
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
309309
; CHECK-NEXT: v_mov_b32_e32 v57, v0
310310
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
@@ -357,7 +357,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
357357
; CHECK-NEXT: ds_write_b32 v0, v57
358358
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
359359
; CHECK-NEXT: s_branch .LBB0_21
360-
; CHECK-NEXT: .LBB0_24: ; %Flow47
360+
; CHECK-NEXT: .LBB0_24: ; %Flow49
361361
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42
362362
; CHECK-NEXT: .LBB0_25:
363363
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@@ -382,13 +382,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
382382
; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo
383383
; CHECK-NEXT: s_cbranch_scc0 .LBB0_34
384384
; CHECK-NEXT: ; %bb.26:
385-
; CHECK-NEXT: s_add_u32 s42, s44, 8
386-
; CHECK-NEXT: s_addc_u32 s43, s45, 0
387-
; CHECK-NEXT: s_mov_b32 s44, 0
385+
; CHECK-NEXT: s_mov_b32 s42, 0
388386
; CHECK-NEXT: s_branch .LBB0_29
389-
; CHECK-NEXT: .LBB0_27: ; %Flow38
387+
; CHECK-NEXT: .LBB0_27: ; %Flow40
390388
; CHECK-NEXT: ; in Loop: Header=BB0_29 Depth=1
391-
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
389+
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43
392390
; CHECK-NEXT: .LBB0_28: ; in Loop: Header=BB0_29 Depth=1
393391
; CHECK-NEXT: v_mov_b32_e32 v31, v40
394392
; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -405,13 +403,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
405403
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
406404
; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
407405
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
408-
; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
409-
; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s44
410-
; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s44
406+
; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42
407+
; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s42
408+
; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s42
411409
; CHECK-NEXT: s_cbranch_scc0 .LBB0_34
412410
; CHECK-NEXT: .LBB0_29: ; =>This Inner Loop Header: Depth=1
413411
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
414-
; CHECK-NEXT: s_mov_b32 s45, exec_lo
412+
; CHECK-NEXT: s_mov_b32 s43, exec_lo
415413
; CHECK-NEXT: ds_read_b32 v0, v0
416414
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
417415
; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
@@ -420,15 +418,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
420418
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
421419
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
422420
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
423-
; CHECK-NEXT: v_add_co_u32 v2, s4, s42, v1
424-
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s43, 0, s4
421+
; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1
422+
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4
425423
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
426424
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
427425
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
428426
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
429427
; CHECK-NEXT: s_clause 0x1
430-
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
431-
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
428+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
429+
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
432430
; CHECK-NEXT: s_waitcnt vmcnt(0)
433431
; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
434432
; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
@@ -442,8 +440,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
442440
; CHECK-NEXT: s_cbranch_scc0 .LBB0_28
443441
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_29 Depth=1
444442
; CHECK-NEXT: s_clause 0x1
445-
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
446-
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
443+
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
444+
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
447445
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
448446
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
449447
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45

0 commit comments

Comments
 (0)