llvm · sgundapa · Feb 24, 2025 · Jan 9, 2025 · Jan 30, 2025 · Feb 10, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -385,16 +385,18 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
 }
 
-static Value *
-calculateVectorIndex(Value *Ptr,
-                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
+static Value *calculateVectorIndex(
+    Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
   auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
   if (!GEP)
     return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
 
   auto I = GEPIdx.find(GEP);
   assert(I != GEPIdx.end() && "Must have entry for GEP!");
-  return I->second;
+
+  Value *IndexValue = I->second;
+  assert(IndexValue && "index value missing from GEP index map");
+  return IndexValue;
 }
 
 static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
@@ -449,7 +451,7 @@ static Value *promoteAllocaUserToVector(
     Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
     unsigned VecStoreSize, unsigned ElementSize,
     DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
-    std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal,
+    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
     SmallVectorImpl<LoadInst *> &DeferredLoads) {
   // Note: we use InstSimplifyFolder because it can leverage the DataLayout
   // to do more folding, especially in the case of vector splats.
@@ -757,7 +759,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     return false;
   }
 
-  std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
+  std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -122,6 +122,34 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
   ret void
 }
 
+%Block4 = type { [2 x i32], i32 }
+@block4 = external addrspace(1) global %Block4
+%gl_PV = type { <4 x i32>, i32, [1 x i32], [1 x i32] }
+@pv1 = external addrspace(1) global %gl_PV
+
+; This should not crash on an aliased variable offset that can be
+; optimized out (variable %aliasTofoo3 in the test)
+define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
+; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
+; CHECK-NEXT:    [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
+; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x i32> %input, i32 [[TMP2]], i64 3
+; CHECK-NEXT:    store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
+; CHECK-NEXT:    ret void
+;
+  %f1 = alloca [3 x i32], align 4, addrspace(5)
+  %G1 = getelementptr inbounds i8, ptr addrspace(5) %f1, i32 8
+  %foo3.unpack2 = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
+  store i32 %foo3.unpack2, ptr addrspace(5) %G1, align 4
+  %aliasTofoo3 = load i32, ptr addrspace(5) %G1, align 4
+  %foo5 = getelementptr [3 x i32], ptr addrspace(5) %f1, i32 0, i32 %aliasTofoo3
+  %foo6 = load i32, ptr addrspace(5) %foo5, align 4
+  %foo12 = insertelement <4 x i32> %input, i32 %foo6, i64 3
+  store <4 x i32> %foo12, ptr addrspace(1) @pv1, align 16
+  ret void
+}
+
 define amdgpu_vs void @promote_memmove_aggr() #0 {
 ; CHECK-LABEL: @promote_memmove_aggr(
 ; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) @pv, align 4