[SeparateConstOffsetFromGEP] propagate const offset through GEP chains #143470

Open. Wants to merge 3 commits into base: main.
48 changes: 37 additions & 11 deletions llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1048,19 +1048,31 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   if (GEP->getType()->isVectorTy())
     return false;
 
+  // If the base of this GEP is a ptradd of a constant, lets pass the constant
+  // along. This ensures that when we have a chain of GEPs the constant
+  // offset from each is accumulated.
+  Value *NewBase;
+  const APInt *BaseOffset;
+  const bool ExtractBase =
+      match(GEP->getPointerOperand(),
+            m_PtrAdd(m_Value(NewBase), m_APInt(BaseOffset)));
+
+  const int64_t BaseByteOffset = ExtractBase ? BaseOffset->getSExtValue() : 0;
+
   // The backend can already nicely handle the case where all indices are
   // constant.
-  if (GEP->hasAllConstantIndices())
+  if (GEP->hasAllConstantIndices() && !ExtractBase)
     return false;
 
   bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);
 
   bool NeedsExtraction;
-  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+  int64_t AccumulativeByteOffset =
+      BaseByteOffset + accumulateByteOffset(GEP, NeedsExtraction);
 
   TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
 
-  if (!NeedsExtraction) {
+  if (!NeedsExtraction && !ExtractBase) {
     Changed |= reorderGEP(GEP, TTI);
     return Changed;
   }
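To make the new ExtractBase path concrete, here is a minimal, hypothetical LLVM IR sketch (the function and value names are made up and not taken from this PR or its tests): the outer GEP's pointer operand is a constant-offset ptradd, so its constant is folded into the accumulated byte offset instead of staying on an intermediate pointer.

; Hypothetical input: %mid is a constant-only i8 GEP (a ptradd), and %p
; indexes off it with a variable index.
define ptr @chain(ptr %base, i64 %i) {
  %mid = getelementptr inbounds i8, ptr %base, i64 16
  %p = getelementptr inbounds i32, ptr %mid, i64 %i
  ret ptr %p
}

; Rough shape after splitting (illustrative, not exact pass output): the
; variadic part is computed directly off %base and a single trailing
; constant GEP carries the accumulated 16 bytes, with %mid deleted once it
; becomes dead.
;   %v = getelementptr i32, ptr %base, i64 %i
;   %p = getelementptr i8, ptr %v, i64 16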
@@ -1084,7 +1096,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 
   // Track information for preserving GEP flags.
   bool AllOffsetsNonNegative = AccumulativeByteOffset >= 0;
-  bool AllNUWPreserved = true;
+  bool AllNUWPreserved = GEP->hasNoUnsignedWrap();
+  bool NewGEPInBounds = GEP->isInBounds();
+  bool NewGEPNUSW = GEP->hasNoUnsignedSignedWrap();
 
   // Remove the constant offset in each sequential index. The resultant GEP
   // computes the variadic base.
@@ -1120,6 +1134,16 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
       }
     }
   }
+  if (ExtractBase) {
+    GEPOperator *Base = cast<GEPOperator>(GEP->getPointerOperand());
+    AllNUWPreserved &= Base->hasNoUnsignedWrap();
+    NewGEPInBounds &= Base->isInBounds();
+    NewGEPNUSW &= Base->hasNoUnsignedSignedWrap();
+    AllOffsetsNonNegative &= BaseByteOffset >= 0;
+
+    GEP->setOperand(0, NewBase);
+    RecursivelyDeleteTriviallyDeadInstructions(Base);
+  }
 
   // Clear the inbounds attribute because the new index may be off-bound.
   // e.g.,
@@ -1147,21 +1171,21 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 
   // If the initial GEP was NUW and all operations that we reassociate were NUW
   // additions, the resulting GEPs are also NUW.
-  if (GEP->hasNoUnsignedWrap() && AllNUWPreserved) {
+  if (AllNUWPreserved) {
     NewGEPFlags |= GEPNoWrapFlags::noUnsignedWrap();
     // If the initial GEP additionally had NUSW (or inbounds, which implies
     // NUSW), we know that the indices in the initial GEP must all have their
     // signbit not set. For indices that are the result of NUW adds, the
     // add-operands therefore also don't have their signbit set. Therefore, all
     // indices of the resulting GEPs are non-negative -> we can preserve
     // the inbounds/nusw flag.
-    CanPreserveInBoundsNUSW |= GEP->hasNoUnsignedSignedWrap();
+    CanPreserveInBoundsNUSW |= NewGEPNUSW;
   }
 
   if (CanPreserveInBoundsNUSW) {
-    if (GEP->isInBounds())
+    if (NewGEPInBounds)
       NewGEPFlags |= GEPNoWrapFlags::inBounds();
-    else if (GEP->hasNoUnsignedSignedWrap())
+    else if (NewGEPNUSW)
       NewGEPFlags |= GEPNoWrapFlags::noUnsignedSignedWrap();
   }
 
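A hedged illustration of why the wrap flags are now intersected with the base GEP's flags rather than read off the outer GEP alone (a made-up example; the function and value names below are assumptions): once the base's constant offset is folded into the outer GEP's byte offset, the rebased result may only keep nuw/inbounds/nusw if both the outer GEP and the folded base carried them.

; Hypothetical: the outer GEP carries inbounds and nuw, but the base ptradd
; carries neither. After folding the base's 8-byte offset into the outer
; offset, the result can no longer be assumed nuw or inbounds, which is what
; the AllNUWPreserved / NewGEPInBounds / NewGEPNUSW intersection enforces.
define ptr @flags(ptr %base, i64 %i) {
  %mid = getelementptr i8, ptr %base, i64 8
  %p = getelementptr inbounds nuw i32, ptr %mid, i64 %i
  ret ptr %p
}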
@@ -1242,11 +1266,13 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
-  for (BasicBlock &B : F) {
-    if (!DT->isReachableFromEntry(&B))
+
+  ReversePostOrderTraversal<Function *> RPOT(&F);

Contributor:

How is this related to the rest of the patch?

Member Author:

The intent is to ensure that when there is a chain of GEPs, we always separate constant offsets in order. That way, the logic that picks up the constant offset from the first GEP and passes it along through the second always works as expected. My understanding is that a standard iteration doesn't give strong guarantees about the order in which blocks are visited, whereas an RPOT ensures GEPs are transformed in order.

+  for (BasicBlock *B : RPOT) {
+    if (!DT->isReachableFromEntry(B))
       continue;
 
-    for (Instruction &I : llvm::make_early_inc_range(B))
+    for (Instruction &I : llvm::make_early_inc_range(*B))
       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
         Changed |= splitGEP(GEP);
     // No need to split GEP ConstantExprs because all its indices are constant
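To illustrate the ordering concern from the review thread above, here is a hypothetical two-block chain (names and shapes are assumptions, not taken from this PR's tests): because a block's dominators precede it in a reverse post-order traversal, the defining GEP is split before its user, so the user's pointer operand is already in the ptradd form that the ExtractBase logic matches.

; Hypothetical chain split across blocks.
define ptr @cross_block(ptr %base, i64 %i, i64 %j) {
entry:
  ; Splitting this GEP first leaves its constant part as a trailing i8 GEP
  ; (a ptradd) feeding %g1's users.
  %g1 = getelementptr inbounds [4 x i32], ptr %base, i64 %i, i64 2
  br label %use

use:
  ; Visited after entry under RPO, so by now the pointer operand of %g2 is
  ; that ptradd and its constant keeps accumulating into %g2's offset.
  %g2 = getelementptr inbounds i32, ptr %g1, i64 %j
  ret ptr %g2
}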
@@ -279,11 +279,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT: ds_write_b32 v0, v58
 ; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_16: ; %Flow45
+; CHECK-NEXT: .LBB0_16: ; %Flow43
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69
 ; CHECK-NEXT: v_mov_b32_e32 v57, v0
-; CHECK-NEXT: .LBB0_17: ; %Flow46
+; CHECK-NEXT: .LBB0_17: ; %Flow44
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
 ; CHECK-NEXT: s_mov_b32 s55, exec_lo
@@ -330,11 +330,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT: ds_write_b32 v0, v57
 ; CHECK-NEXT: s_branch .LBB0_19
-; CHECK-NEXT: .LBB0_22: ; %Flow43
+; CHECK-NEXT: .LBB0_22: ; %Flow41
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_inst_prefetch 0x2
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
-; CHECK-NEXT: .LBB0_23: ; %Flow44
+; CHECK-NEXT: .LBB0_23: ; %Flow42
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
@@ -347,7 +347,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: s_or_b32 s53, s4, s53
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5
-; CHECK-NEXT: .LBB0_25: ; %Flow51
+; CHECK-NEXT: .LBB0_25: ; %Flow49
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
 ; CHECK-NEXT: v_mov_b32_e32 v0, 1