Skip to content

Commit 30f4ac1

Browse files
committed
[SepGEP] Reorder trivial GEP chains to separate constants
Change-Id: I813c3c402093fc73bed70a50cdfa24d396e1b771
1 parent 8ea7f1d commit 30f4ac1

File tree

7 files changed

+628
-139
lines changed

7 files changed

+628
-139
lines changed

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,11 @@ class SeparateConstOffsetFromGEP {
391391
/// and returns true if the splitting succeeds.
392392
bool splitGEP(GetElementPtrInst *GEP);
393393

394+
/// Tries to reorder the given GEP with the GEP that produces the base if
395+
/// doing so results in producing a constant offset as the outermost
396+
/// index.
397+
bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);
398+
394399
/// Lower a GEP with multiple indices into multiple GEPs with a single index.
395400
/// Function splitGEP already split the original GEP into a variadic part and
396401
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -964,6 +969,51 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
964969
Variadic->eraseFromParent();
965970
}
966971

972+
/// Swap a trivial GEP chain `PtrGEP -> GEP` so that the GEP carrying a
/// constant offset becomes the outermost one, enabling the backend to fold
/// that constant into the addressing mode (e.g. as an immediate offset).
///
/// Only handles "trivial" chains: both GEPs have a single, non-aggregate
/// index over the same element type, and neither is inbounds (swapping the
/// order of inbounds GEPs would not be poison-preserving in general).
///
/// \param GEP the outer GEP whose pointer operand is itself a GEP.
/// \param TTI used to check that the constant offset is legal in an
///        addressing mode for the target/address space.
/// \returns true if the chain was reordered (the original \p GEP is erased).
bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) {
  Type *GEPType = GEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains.
  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1 ||
      GEP->isInBounds())
    return false;

  auto *PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
  if (!PtrGEP || PtrGEP->isInBounds())
    return false;
  Type *PtrGEPType = PtrGEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains.
  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
    return false;

  // TODO: support reordering for non-trivial GEP chains.
  if (PtrGEPType != GEPType ||
      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
    return false;

  // Only reorder when the inner GEP actually contributes a constant offset;
  // otherwise there is nothing to hoist outward.
  bool NestedNeedsExtraction;
  int64_t NestedByteOffset =
      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
  if (!NestedNeedsExtraction)
    return false;

  // The swap only pays off if the target can absorb the constant offset into
  // an addressing mode; bail out when it cannot.
  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
  if (!TTI.isLegalAddressingMode(GEPType,
                                 /*BaseGV=*/nullptr, NestedByteOffset,
                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
    return false;

  IRBuilder<> Builder(GEP);
  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
  // For trivial GEP chains, we can swap the indices: apply the outer
  // (variable) index first, then the inner (constant-bearing) index.
  auto *NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
                                   SmallVector<Value *, 4>(GEP->indices()));
  auto *NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                   SmallVector<Value *, 4>(PtrGEP->indices()));
  GEP->replaceAllUsesWith(NewGEP);
  // After RAUW the original GEP is dead; deleting it recursively also cleans
  // up PtrGEP when this use was its last one.
  RecursivelyDeleteTriviallyDeadInstructions(GEP);
  return true;
}
1016+
9671017
bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9681018
// Skip vector GEPs.
9691019
if (GEP->getType()->isVectorTy())
@@ -979,11 +1029,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9791029
bool NeedsExtraction;
9801030
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
9811031

982-
if (!NeedsExtraction)
983-
return Changed;
984-
9851032
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
9861033

1034+
if (!NeedsExtraction) {
1035+
Changed |= reorderGEP(GEP, TTI);
1036+
return Changed;
1037+
}
1038+
9871039
// If LowerGEP is disabled, before really splitting the GEP, check whether the
9881040
// backend supports the addressing mode we are about to produce. If no, this
9891041
// splitting probably won't be beneficial.

llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll

Lines changed: 116 additions & 135 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/PowerPC/licm-remat.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ define linkonce_odr void @ZN6snappyDecompressor_(ptr %this, ptr %writer) {
2121
; CHECK-LABEL: ZN6snappyDecompressor_:
2222
; CHECK: # %bb.0: # %entry
2323
; CHECK: addis 4, 2, .L__ModuleStringPool@toc@ha
24-
; CHECK: addi 25, 4, .L__ModuleStringPool@toc@l
24+
; CHECK: addi 26, 4, .L__ModuleStringPool@toc@l
2525
; CHECK: .LBB0_2: # %for.cond
2626
; CHECK-NOT: addis {{[0-9]+}}, 2, .L__ModuleStringPool@toc@ha
2727
; CHECK: bctrl
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -S -passes=separate-const-offset-from-gep < %s | FileCheck %s
3+
4+
define void @inbounds(ptr %in.ptr, i64 %in.idx0, i64 %in.idx1, i64 %in.idx2) {
5+
; CHECK-LABEL: define void @inbounds(
6+
; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]], i64 [[IN_IDX2:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: entry:
8+
; CHECK-NEXT: [[CONST1:%.*]] = getelementptr inbounds <16 x i16>, ptr [[IN_PTR]], i64 2
9+
; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds <16 x i16>, ptr [[CONST1]], i64 [[IN_IDX1]]
10+
; CHECK-NEXT: [[CONST2:%.*]] = getelementptr <16 x i16>, ptr [[IN_PTR]], i64 24
11+
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds <16 x i16>, ptr [[CONST2]], i64 [[IN_IDX1]]
12+
; CHECK-NEXT: [[CONST3:%.*]] = getelementptr inbounds <16 x i16>, ptr [[IN_PTR]], i64 6
13+
; CHECK-NEXT: [[IDX3:%.*]] = getelementptr <16 x i16>, ptr [[CONST3]], i64 [[IN_IDX2]]
14+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr <16 x i16>, ptr [[IN_PTR]], i64 [[IN_IDX2]]
15+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <16 x i16>, ptr [[TMP0]], i64 28
16+
; CHECK-NEXT: ret void
17+
;
18+
entry:
19+
%const1 = getelementptr inbounds <16 x i16>, ptr %in.ptr, i64 2
20+
%idx1 = getelementptr inbounds <16 x i16>, ptr %const1, i64 %in.idx1
21+
%const2 = getelementptr <16 x i16>, ptr %in.ptr, i64 24
22+
%idx2 = getelementptr inbounds <16 x i16>, ptr %const2, i64 %in.idx1
23+
%const3 = getelementptr inbounds <16 x i16>, ptr %in.ptr, i64 6
24+
%idx3 = getelementptr <16 x i16>, ptr %const3, i64 %in.idx2
25+
%const4 = getelementptr <16 x i16>, ptr %in.ptr, i64 28
26+
%idx4 = getelementptr <16 x i16>, ptr %const4, i64 %in.idx2
27+
ret void
28+
}
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --start-before=separate-const-offset-from-gep < %s | FileCheck %s
3+
4+
define protected amdgpu_kernel void @sink_addr(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
5+
; CHECK-LABEL: sink_addr:
6+
; CHECK: ; %bb.0: ; %entry
7+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
8+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
9+
; CHECK-NEXT: s_lshl_b32 s3, s1, 1
10+
; CHECK-NEXT: s_add_i32 s0, s0, s3
11+
; CHECK-NEXT: s_lshl_b32 s2, s2, 1
12+
; CHECK-NEXT: s_add_i32 s0, s0, s2
13+
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
14+
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
15+
; CHECK-NEXT: ; %bb.1: ; %bb.1
16+
; CHECK-NEXT: v_mov_b32_e32 v12, s0
17+
; CHECK-NEXT: ds_read_b128 v[0:3], v12
18+
; CHECK-NEXT: ds_read_b128 v[4:7], v12 offset:512
19+
; CHECK-NEXT: ds_read_b128 v[8:11], v12 offset:1024
20+
; CHECK-NEXT: ds_read_b128 v[12:15], v12 offset:1536
21+
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
22+
; CHECK-NEXT: ;;#ASMSTART
23+
; CHECK-NEXT: ; use v[0:3]
24+
; CHECK-NEXT: ;;#ASMEND
25+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
26+
; CHECK-NEXT: ;;#ASMSTART
27+
; CHECK-NEXT: ; use v[4:7]
28+
; CHECK-NEXT: ;;#ASMEND
29+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
30+
; CHECK-NEXT: ;;#ASMSTART
31+
; CHECK-NEXT: ; use v[8:11]
32+
; CHECK-NEXT: ;;#ASMEND
33+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
34+
; CHECK-NEXT: ;;#ASMSTART
35+
; CHECK-NEXT: ; use v[12:15]
36+
; CHECK-NEXT: ;;#ASMEND
37+
; CHECK-NEXT: .LBB0_2: ; %end
38+
; CHECK-NEXT: s_add_i32 s1, s0, 0x200
39+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
40+
; CHECK-NEXT: s_add_i32 s2, s0, 0x400
41+
; CHECK-NEXT: ;;#ASMSTART
42+
; CHECK-NEXT: ; use v0
43+
; CHECK-NEXT: ;;#ASMEND
44+
; CHECK-NEXT: v_mov_b32_e32 v0, s1
45+
; CHECK-NEXT: s_add_i32 s3, s0, 0x600
46+
; CHECK-NEXT: ;;#ASMSTART
47+
; CHECK-NEXT: ; use v0
48+
; CHECK-NEXT: ;;#ASMEND
49+
; CHECK-NEXT: v_mov_b32_e32 v0, s2
50+
; CHECK-NEXT: ;;#ASMSTART
51+
; CHECK-NEXT: ; use v0
52+
; CHECK-NEXT: ;;#ASMEND
53+
; CHECK-NEXT: v_mov_b32_e32 v0, s3
54+
; CHECK-NEXT: ;;#ASMSTART
55+
; CHECK-NEXT: ; use v0
56+
; CHECK-NEXT: ;;#ASMEND
57+
; CHECK-NEXT: s_endpgm
58+
entry:
59+
%base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
60+
%idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
61+
%const1 = getelementptr half, ptr addrspace(3) %base, i32 256
62+
%idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
63+
%const2 = getelementptr half, ptr addrspace(3) %base, i32 512
64+
%idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
65+
%const3 = getelementptr half, ptr addrspace(3) %base, i32 768
66+
%idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
67+
%cmp0 = icmp eq i32 %in.idx0, 0
68+
br i1 %cmp0, label %bb.1, label %end
69+
70+
bb.1:
71+
%val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
72+
%val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
73+
%val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
74+
%val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
75+
call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
76+
call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
77+
call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
78+
call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
79+
br label %end
80+
81+
end:
82+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
83+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
84+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
85+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
86+
ret void
87+
}
88+
89+
define protected amdgpu_kernel void @illegal_addr_mode(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
90+
; CHECK-LABEL: illegal_addr_mode:
91+
; CHECK: ; %bb.0: ; %entry
92+
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
93+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
94+
; CHECK-NEXT: s_lshl_b32 s0, s5, 1
95+
; CHECK-NEXT: s_lshl_b32 s1, s6, 1
96+
; CHECK-NEXT: s_add_i32 s3, s4, s0
97+
; CHECK-NEXT: s_add_i32 s3, s3, s1
98+
; CHECK-NEXT: s_add_i32 s2, s3, 0x12a60
99+
; CHECK-NEXT: s_add_i32 s1, s3, 0x12c60
100+
; CHECK-NEXT: s_add_i32 s0, s3, 0x12ed8
101+
; CHECK-NEXT: s_cmp_lg_u32 s5, 0
102+
; CHECK-NEXT: s_cbranch_scc1 .LBB1_2
103+
; CHECK-NEXT: ; %bb.1: ; %bb.1
104+
; CHECK-NEXT: v_mov_b32_e32 v0, s3
105+
; CHECK-NEXT: v_mov_b32_e32 v4, s2
106+
; CHECK-NEXT: v_mov_b32_e32 v8, s1
107+
; CHECK-NEXT: v_mov_b32_e32 v12, s0
108+
; CHECK-NEXT: ds_read_b128 v[0:3], v0
109+
; CHECK-NEXT: ds_read_b128 v[4:7], v4
110+
; CHECK-NEXT: ds_read_b128 v[8:11], v8
111+
; CHECK-NEXT: ds_read_b128 v[12:15], v12
112+
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
113+
; CHECK-NEXT: ;;#ASMSTART
114+
; CHECK-NEXT: ; use v[0:3]
115+
; CHECK-NEXT: ;;#ASMEND
116+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
117+
; CHECK-NEXT: ;;#ASMSTART
118+
; CHECK-NEXT: ; use v[4:7]
119+
; CHECK-NEXT: ;;#ASMEND
120+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
121+
; CHECK-NEXT: ;;#ASMSTART
122+
; CHECK-NEXT: ; use v[8:11]
123+
; CHECK-NEXT: ;;#ASMEND
124+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
125+
; CHECK-NEXT: ;;#ASMSTART
126+
; CHECK-NEXT: ; use v[12:15]
127+
; CHECK-NEXT: ;;#ASMEND
128+
; CHECK-NEXT: .LBB1_2: ; %end
129+
; CHECK-NEXT: v_mov_b32_e32 v0, s3
130+
; CHECK-NEXT: ;;#ASMSTART
131+
; CHECK-NEXT: ; use v0
132+
; CHECK-NEXT: ;;#ASMEND
133+
; CHECK-NEXT: v_mov_b32_e32 v0, s2
134+
; CHECK-NEXT: ;;#ASMSTART
135+
; CHECK-NEXT: ; use v0
136+
; CHECK-NEXT: ;;#ASMEND
137+
; CHECK-NEXT: v_mov_b32_e32 v0, s1
138+
; CHECK-NEXT: ;;#ASMSTART
139+
; CHECK-NEXT: ; use v0
140+
; CHECK-NEXT: ;;#ASMEND
141+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
142+
; CHECK-NEXT: ;;#ASMSTART
143+
; CHECK-NEXT: ; use v0
144+
; CHECK-NEXT: ;;#ASMEND
145+
; CHECK-NEXT: s_endpgm
146+
entry:
147+
%base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
148+
%idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
149+
%const1 = getelementptr half, ptr addrspace(3) %base, i32 38192
150+
%idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
151+
%const2 = getelementptr half, ptr addrspace(3) %base, i32 38448
152+
%idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
153+
%const3 = getelementptr half, ptr addrspace(3) %base, i32 38764
154+
%idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
155+
%cmp0 = icmp eq i32 %in.idx0, 0
156+
br i1 %cmp0, label %bb.1, label %end
157+
158+
bb.1:
159+
%val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
160+
%val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
161+
%val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
162+
%val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
163+
call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
164+
call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
165+
call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
166+
call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
167+
br label %end
168+
169+
end:
170+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
171+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
172+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
173+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
174+
ret void
175+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2+
; RUN: opt -mtriple=nvptx64-nvidia-cuda -S -passes=separate-const-offset-from-gep < %s | FileCheck %s
3+
4+
define protected amdgpu_kernel void @sink_addr(ptr %in.ptr, i64 %in.idx0, i64 %in.idx1) {
5+
; CHECK-LABEL: define protected amdgpu_kernel void @sink_addr(
6+
; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) {
7+
; CHECK-NEXT: entry:
8+
; CHECK-NEXT: [[IDX0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 [[IN_IDX1]]
9+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
10+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[IN_IDX1]]
11+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i64 256
12+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
13+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i64 [[IN_IDX1]]
14+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i64 512
15+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
16+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 [[IN_IDX1]]
17+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i64 768
18+
; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 [[IN_IDX0]], 0
19+
; CHECK-NEXT: br i1 [[CMP0]], label [[BB_1:%.*]], label [[END:%.*]]
20+
; CHECK: bb.1:
21+
; CHECK-NEXT: [[VAL0:%.*]] = load <8 x i64>, ptr [[IDX0]], align 16
22+
; CHECK-NEXT: [[VAL1:%.*]] = load <8 x i64>, ptr [[TMP2]], align 16
23+
; CHECK-NEXT: [[VAL2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 16
24+
; CHECK-NEXT: [[VAL3:%.*]] = load <8 x i64>, ptr [[TMP8]], align 16
25+
; CHECK-NEXT: call void asm sideeffect "
26+
; CHECK-NEXT: call void asm sideeffect "
27+
; CHECK-NEXT: call void asm sideeffect "
28+
; CHECK-NEXT: call void asm sideeffect "
29+
; CHECK-NEXT: br label [[END]]
30+
; CHECK: end:
31+
; CHECK-NEXT: call void asm sideeffect "
32+
; CHECK-NEXT: call void asm sideeffect "
33+
; CHECK-NEXT: call void asm sideeffect "
34+
; CHECK-NEXT: call void asm sideeffect "
35+
; CHECK-NEXT: ret void
36+
;
37+
entry:
38+
%idx0 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 %in.idx1
39+
%const1 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 256
40+
%idx1 = getelementptr i64, ptr %const1, i64 %in.idx1
41+
%const2 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 512
42+
%idx2 = getelementptr i64, ptr %const2, i64 %in.idx1
43+
%const3 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 768
44+
%idx3 = getelementptr i64, ptr %const3, i64 %in.idx1
45+
%cmp0 = icmp eq i64 %in.idx0, 0
46+
br i1 %cmp0, label %bb.1, label %end
47+
48+
bb.1:
49+
%val0 = load <8 x i64>, ptr %idx0, align 16
50+
%val1 = load <8 x i64>, ptr %idx1, align 16
51+
%val2 = load <8 x i64>, ptr %idx2, align 16
52+
%val3 = load <8 x i64>, ptr %idx3, align 16
53+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val0)
54+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val1)
55+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val2)
56+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val3)
57+
br label %end
58+
59+
end:
60+
call void asm sideeffect "; use $0", "v"(ptr %idx0)
61+
call void asm sideeffect "; use $0", "v"(ptr %idx1)
62+
call void asm sideeffect "; use $0", "v"(ptr %idx2)
63+
call void asm sideeffect "; use $0", "v"(ptr %idx3)
64+
ret void
65+
}

0 commit comments

Comments
 (0)