[AMDGPU] Extended vector promotion to aggregate types. #143784
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: None (zGoldthorpe)

Changes

Extends the `amdgpu-promote-alloca-to-vector` pass to also promote aggregate types whose elements are all the same type to vector registers.

The motivation for this extension was to account for IR generated by the frontend containing several singleton struct types containing vectors or vector-like elements, though the implementation is strictly more general.

Full diff: https://github.com/llvm/llvm-project/pull/143784.diff

2 Files Affected:
- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (modified)
- llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll (added)
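For illustration (not part of the patch text), the `test_s1v4i8` case from the new test file below shows the kind of IR this extension targets: a singleton struct wrapping a vector becomes a plain vector value, and the dynamic GEP plus load is rewritten into an extractelement. The value names in the "after" sketch are illustrative, since the autogenerated CHECK lines only match patterns rather than fixed names.

; before amdgpu-promote-alloca-to-vector
%stack = alloca {<4 x i8>}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)

; after the pass (per the autogenerated CHECK lines; names illustrative)
%stack.vec = freeze <4 x i8> poison
%val = extractelement <4 x i8> %stack.vec, i64 %idx
call void @clobber_i8(i8 %val)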
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 700dc87d2f821..336e3a1db7e73 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,6 +818,28 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
return I;
}
+/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
+/// type is non-homogeneous.
+static Type *getHomogeneousType(Type *Ty) {
+ if (auto *VectorTy = dyn_cast<FixedVectorType>(Ty))
+ return VectorTy->getElementType();
+ if (auto *ArrayTy = dyn_cast<ArrayType>(Ty))
+ return getHomogeneousType(ArrayTy->getElementType());
+ if (auto *StructTy = dyn_cast<StructType>(Ty)) {
+ if (StructTy->getNumElements() == 0)
+ return nullptr;
+
+ auto *Iter = StructTy->element_begin();
+ Type *HTy = getHomogeneousType(*Iter);
+ for (; Iter != StructTy->element_end(); ++Iter)
+ if (getHomogeneousType(*Iter) != HTy)
+ return nullptr;
+
+ return HTy;
+ }
+ return Ty;
+}
+
// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -828,42 +850,43 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
}
Type *AllocaTy = Alloca.getAllocatedType();
- auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
- if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
- uint64_t NumElems = 1;
- Type *ElemTy;
- do {
- NumElems *= ArrayTy->getNumElements();
- ElemTy = ArrayTy->getElementType();
- } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
-
- // Check for array of vectors
- auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
- if (InnerVectorTy) {
- NumElems *= InnerVectorTy->getNumElements();
- ElemTy = InnerVectorTy->getElementType();
- }
+ Type *ElemTy = getHomogeneousType(AllocaTy);
- if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
- unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
- if (ElementSize > 0) {
- unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
- // Expand vector if required to match padding of inner type,
- // i.e. odd size subvectors.
- // Storage size of new vector must match that of alloca for correct
- // behaviour of byte offsets and GEP computation.
- if (NumElems * ElementSize != AllocaSize)
- NumElems = AllocaSize / ElementSize;
- if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
- VectorTy = FixedVectorType::get(ElemTy, NumElems);
- }
- }
+ if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
+ LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
+ return false;
}
- if (!VectorTy) {
- LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
+ unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
+ if (ElementSizeInBits == 0) {
+ LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements.");
+ return false;
+ }
+ if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
+ LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
+ "does not match the type's size\n");
return false;
}
+ unsigned ElementSize = ElementSizeInBits / 8;
+ if (ElementSize == 0)
+ return false;
+
+ // Calculate the size of the corresponding vector, accounting for padding of
+ // inner types, e.g., odd-sized subvectors. Storage size of new vector must
+ // match that of alloca for correct behaviour of byte offsets and GEP
+ // computation.
+ unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+ unsigned NumElems = AllocaSize / ElementSize;
+ if (NumElems == 0) {
+ LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type.");
+ return false;
+ }
+ if (NumElems * ElementSize != AllocaSize) {
+ LLVM_DEBUG(dbgs() << " Cannot convert type into vector of the same size.");
+ return false;
+ }
+ auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
+ assert(VectorTy && "Failed to create vector type.");
const unsigned MaxElements =
(MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
@@ -895,15 +918,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
- Type *VecEltTy = VectorTy->getElementType();
- unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
- if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
- LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
- "does not match the type's size\n");
- return false;
- }
- unsigned ElementSize = ElementSizeInBits / 8;
- assert(ElementSize > 0);
for (auto *U : Uses) {
Instruction *Inst = cast<Instruction>(U->getUser());
@@ -943,7 +957,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
+ Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
new file mode 100644
index 0000000000000..d09f6ba1e7b68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 %s | FileCheck %s
+
+declare void @clobber_i8(i8)
+
+define void @test_v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca <4 x i8>, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [4 x i8], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2v3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2v3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2a3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2a3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s1v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s1v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s1a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s1a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {[4 x i8]}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v2i8v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v2i8v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v2i8v3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v2i8v3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2s2i8s4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2s2i8s4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2s2i8s3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2s2i8s3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <5 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s3i8s1i8v2i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s3i8s1i8v2i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+; heterogeneous element types are not supported
+define void @test_heterogeneous(i64 %idx) {
+; CHECK-LABEL: define void @test_heterogeneous(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, i8, i16}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+; empty structs are not supported
+define void @test_empty(i64 %idx) {
+; CHECK-LABEL: define void @test_empty(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, {}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
I thought SROA already tried to flatten aggregates out into simple arrays. Why do we need to do this? We don't need to optimally handle all IR, just post-optimized IR.
This behaviour is actually already implemented in the AMDGPUPromoteAllocaToVector pass. I put the test in to ensure this behaviour didn't change with this PR.
If I understand correctly, the problem with SROA is that it tends to prefer promoting types to scalar registers, rather than vectors, which is why the AMDGPUPromoteAllocaToVector pass is queued to occur before SROA in the pipeline.
If SROA can break the value up, that's better; we should be running this after SROA.
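As a side note for anyone wanting to experiment with the ordering question locally (a hedged sketch, not part of the PR): the two orderings can be compared by running the passes explicitly with opt on the new test file, reusing the triple and pass name from its RUN line. Here `sroa` is the standard new-pass-manager name for SROA, and the input filename is just a local copy of the test.

opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector,sroa promote-alloca-structs.ll
opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca-to-vector promote-alloca-structs.ll

The first command mirrors the current pipeline order described above (vector promotion before SROA); the second approximates the ordering suggested in this comment.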
LLVM Buildbot has detected a new failure on one of its builders. Full details are available at: https://lab.llvm.org/buildbot/#/builders/157/builds/30754
Here is the relevant piece of the build log for reference:
Extends the `amdgpu-promote-alloca-to-vector` pass to also promote aggregate types whose elements are all the same type to vector registers. The motivation for this extension was to account for IR generated by the frontend containing several singleton struct types containing vectors or vector-like elements, though the implementation is strictly more general.
Revert "[AMDGPU] Extended vector promotion to aggregate types." (#144366): Reverts #143784. Patch fails some internal tests. Will investigate more thoroughly before attempting to remerge.