[AMDGPU] Avoid crashes for non-byte-sized types in PromoteAlloca (#134042)

ritter-x2a · web-flow · commit cf188d650ce2 · 2025-04-14T09:13:54.000+02:00
This patch addresses three problems when promoting allocas to vectors:
- Element types with size &lt; 1 byte in allocas with a vector type caused
  divisions by zero.
- Element types whose size doesn't match their AllocSize hit an assertion.
- Access types whose size doesn't match their AllocSize hit an assertion.

With this patch, we do not attempt to promote affected allocas to vectors. In
principle, we could handle these cases in PromoteAlloca, e.g., by truncating
and extending elements from/to their allocation size. It's however unclear if
we ever encounter such cases in practice, so that doesn't seem worth the added
complexity.

For SWDEV-511252
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -729,6 +729,11 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
   // complicated.
   if (isa<FixedVectorType>(AccessTy)) {
     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
+    // If the type size and the store size don't match, we would need to do more
+    // than just bitcast to translate between an extracted/insertable subvectors
+    // and the accessed value.
+    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
+      return false;
     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
     return AccTS.isKnownMultipleOf(VecTS);
   }
@@ -813,15 +818,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
     if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
       unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
-      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-      // Expand vector if required to match padding of inner type,
-      // i.e. odd size subvectors.
-      // Storage size of new vector must match that of alloca for correct
-      // behaviour of byte offsets and GEP computation.
-      if (NumElems * ElementSize != AllocaSize)
-        NumElems = AllocaSize / ElementSize;
-      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
-        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
     }
   }
 
@@ -861,7 +868,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that types where the store/allocation sizes don't match the type size
+; don't crash.
+
+
+define <7 x i9> @load_elem_i9_access_7xi9() {
+; CHECK-LABEL: @load_elem_i9_access_7xi9(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i9>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <7 x i9>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <7 x i9> [[L]]
+;
+  %p = alloca <16 x i9>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <7 x i9>, ptr addrspace(5) %g, align 1
+  ret <7 x i9> %l
+}
+
+define <8 x i1> @load_elem_i1_access_8xi1() {
+; CHECK-LABEL: @load_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @load_elem_i1_access_3xi1() {
+; CHECK-LABEL: @load_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @load_elem_i8_access_3xi1() {
+; CHECK-LABEL: @load_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @load_elem_i8_access_8xi1() {
+; CHECK-LABEL: @load_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @storeload_elem_i1_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i1_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i8_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @storeload_elem_i8_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @array_of_vec_elem_i1_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i1_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i8_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <8 x i8>], align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @array_of_vec_elem_i8_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <16 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> [[P]], i8 1, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 2, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 3, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 4, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 5, i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 6, i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 7, i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 8, i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 5, i32 4
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}