Skip to content

Commit 190110d

Browse files
jrbyrnes authored
and bcahoon committed
[SeparateConstOffsetFromGEP] Reland: Reorder trivial GEP chains to separate constants (llvm#81671)
Actually update tests w.r.t llvm@9e5a77f and reland llvm#73056 Change-Id: Ic9bbad513c6056b8c56a856015d808e01be81004
1 parent f2f33aa commit 190110d

File tree

8 files changed

+683
-158
lines changed

8 files changed

+683
-158
lines changed

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,11 @@ class SeparateConstOffsetFromGEP {
393393
/// and returns true if the splitting succeeds.
394394
bool splitGEP(GetElementPtrInst *GEP);
395395

396+
/// Tries to reorder the given GEP with the GEP that produces the base if
397+
/// doing so results in producing a constant offset as the outermost
398+
/// index.
399+
bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);
400+
396401
/// Lower a GEP with multiple indices into multiple GEPs with a single index.
397402
/// Function splitGEP already split the original GEP into a variadic part and
398403
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -970,6 +975,66 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
970975
Variadic->eraseFromParent();
971976
}
972977

978+
// Tries to swap GEP with the GEP producing its base pointer when both are
// trivial (single-index, non-aggregate, same element types) so that the
// constant offset becomes the outermost index, enabling it to be folded
// into the addressing mode.
//
// \param GEP the outer GEP whose base may be another GEP.
// \param TTI used to verify the target supports the resulting base+offset
//        addressing mode.
// \returns true if the chain was reordered (GEP is RAUW'd and erased).
bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) {
  Type *GEPType = GEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
    return false;

  auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
  if (!PtrGEP)
    return false;
  Type *PtrGEPType = PtrGEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
    return false;

  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType != GEPType ||
      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
    return false;

  // Only reorder when the inner GEP actually carries a constant byte offset;
  // otherwise the swap gains nothing.
  bool NestedNeedsExtraction;
  int64_t NestedByteOffset =
      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
  if (!NestedNeedsExtraction)
    return false;

  // Check the target can fold the extracted constant into the address.
  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
  if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                 /*BaseGV=*/nullptr, NestedByteOffset,
                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
    return false;

  IRBuilder<> Builder(GEP);
  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
  bool GEPInBounds = GEP->isInBounds();
  bool PtrGEPInBounds = PtrGEP->isInBounds();
  // The swapped chain may only stay inbounds when both original GEPs are
  // inbounds AND both indices are provably non-negative (a negative index
  // could make an intermediate pointer stray outside the object).
  bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
  if (IsChainInBounds) {
    auto GEPIdx = GEP->indices().begin();
    auto KnownGEPIdx = computeKnownBits(GEPIdx->get(), *DL);
    IsChainInBounds &= KnownGEPIdx.isNonNegative();
    if (IsChainInBounds) {
      // BUGFIX: inspect the *inner* GEP's index here. The original code
      // queried GEP->indices() a second time, so PtrGEP's index sign was
      // never actually checked before marking the chain inbounds.
      auto PtrGEPIdx = PtrGEP->indices().begin();
      auto KnownPtrGEPIdx = computeKnownBits(PtrGEPIdx->get(), *DL);
      IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
    }
  }

  // For trivial GEP chains, we can swap the indices.
  auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
                                  SmallVector<Value *, 4>(GEP->indices()));
  cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
  auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                  SmallVector<Value *, 4>(PtrGEP->indices()));
  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
  GEP->replaceAllUsesWith(NewGEP);
  RecursivelyDeleteTriviallyDeadInstructions(GEP);
  return true;
}
1037+
9731038
bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9741039
// Skip vector GEPs.
9751040
if (GEP->getType()->isVectorTy())
@@ -985,11 +1050,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9851050
bool NeedsExtraction;
9861051
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
9871052

988-
if (!NeedsExtraction)
989-
return Changed;
990-
9911053
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
9921054

1055+
if (!NeedsExtraction) {
1056+
Changed |= reorderGEP(GEP, TTI);
1057+
return Changed;
1058+
}
1059+
9931060
// If LowerGEP is disabled, before really splitting the GEP, check whether the
9941061
// backend supports the addressing mode we are about to produce. If no, this
9951062
// splitting probably won't be beneficial.

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
143143
; CHECK-NEXT: v_mov_b32_e32 v47, 0
144144
; CHECK-NEXT: s_mov_b32 s49, 0
145145
; CHECK-NEXT: s_branch .LBB0_7
146-
; CHECK-NEXT: .LBB0_5: ; %Flow41
146+
; CHECK-NEXT: .LBB0_5: ; %Flow43
147147
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
148148
; CHECK-NEXT: s_inst_prefetch 0x2
149149
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
150-
; CHECK-NEXT: .LBB0_6: ; %Flow42
150+
; CHECK-NEXT: .LBB0_6: ; %Flow44
151151
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
152152
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45
153153
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
@@ -304,7 +304,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
304304
; CHECK-NEXT: ds_write_b32 v0, v58
305305
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
306306
; CHECK-NEXT: s_branch .LBB0_9
307-
; CHECK-NEXT: .LBB0_18: ; %Flow43
307+
; CHECK-NEXT: .LBB0_18: ; %Flow45
308308
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
309309
; CHECK-NEXT: v_mov_b32_e32 v57, v0
310310
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
@@ -357,7 +357,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
357357
; CHECK-NEXT: ds_write_b32 v0, v57
358358
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
359359
; CHECK-NEXT: s_branch .LBB0_21
360-
; CHECK-NEXT: .LBB0_24: ; %Flow47
360+
; CHECK-NEXT: .LBB0_24: ; %Flow49
361361
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42
362362
; CHECK-NEXT: .LBB0_25:
363363
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@@ -382,13 +382,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
382382
; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo
383383
; CHECK-NEXT: s_cbranch_scc0 .LBB0_34
384384
; CHECK-NEXT: ; %bb.26:
385-
; CHECK-NEXT: s_add_u32 s42, s44, 8
386-
; CHECK-NEXT: s_addc_u32 s43, s45, 0
387-
; CHECK-NEXT: s_mov_b32 s44, 0
385+
; CHECK-NEXT: s_mov_b32 s42, 0
388386
; CHECK-NEXT: s_branch .LBB0_29
389-
; CHECK-NEXT: .LBB0_27: ; %Flow38
387+
; CHECK-NEXT: .LBB0_27: ; %Flow40
390388
; CHECK-NEXT: ; in Loop: Header=BB0_29 Depth=1
391-
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
389+
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43
392390
; CHECK-NEXT: .LBB0_28: ; in Loop: Header=BB0_29 Depth=1
393391
; CHECK-NEXT: v_mov_b32_e32 v31, v40
394392
; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -405,13 +403,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
405403
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
406404
; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
407405
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
408-
; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
409-
; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s44
410-
; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s44
406+
; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42
407+
; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s42
408+
; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s42
411409
; CHECK-NEXT: s_cbranch_scc0 .LBB0_34
412410
; CHECK-NEXT: .LBB0_29: ; =>This Inner Loop Header: Depth=1
413411
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
414-
; CHECK-NEXT: s_mov_b32 s45, exec_lo
412+
; CHECK-NEXT: s_mov_b32 s43, exec_lo
415413
; CHECK-NEXT: ds_read_b32 v0, v0
416414
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
417415
; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
@@ -420,15 +418,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
420418
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
421419
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
422420
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
423-
; CHECK-NEXT: v_add_co_u32 v2, s4, s42, v1
424-
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s43, 0, s4
421+
; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1
422+
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4
425423
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
426424
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
427425
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
428426
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
429427
; CHECK-NEXT: s_clause 0x1
430-
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
431-
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
428+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
429+
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
432430
; CHECK-NEXT: s_waitcnt vmcnt(0)
433431
; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
434432
; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
@@ -442,8 +440,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
442440
; CHECK-NEXT: s_cbranch_scc0 .LBB0_28
443441
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_29 Depth=1
444442
; CHECK-NEXT: s_clause 0x1
445-
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
446-
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
443+
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
444+
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
447445
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
448446
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
449447
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45

0 commit comments

Comments
 (0)