[AMDGPU] Extended vector promotion to aggregate types. #143784
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: None (zGoldthorpe)

Changes

Extends the `amdgpu-promote-alloca-to-vector` pass to also promote aggregate types whose elements are all the same type to vector registers.

The motivation for this extension was to account for IR generated by the frontend containing several singleton struct types containing vectors or vector-like elements, though the implementation is strictly more general.

Full diff: https://github.com/llvm/llvm-project/pull/143784.diff

2 Files Affected:
- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (modified)
- llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll (added)
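For illustration (not part of the patch text), the `test_s1v4i8` case from the new test file below shows the kind of IR this extension targets: a singleton struct wrapping a vector becomes a plain vector value, and the dynamic GEP plus load is rewritten into an extractelement. The value names in the "after" sketch are illustrative, since the autogenerated CHECK lines only match patterns rather than fixed names.

; before amdgpu-promote-alloca-to-vector
%stack = alloca {<4 x i8>}, align 4, addrspace(5)
%ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
%val = load i8, ptr addrspace(5) %ptr, align 1
call void @clobber_i8(i8 %val)

; after the pass (per the autogenerated CHECK lines; names illustrative)
%stack.vec = freeze <4 x i8> poison
%val = extractelement <4 x i8> %stack.vec, i64 %idx
call void @clobber_i8(i8 %val)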
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 700dc87d2f821..336e3a1db7e73 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,6 +818,28 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
return I;
}
+/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
+/// type is non-homogeneous.
+static Type *getHomogeneousType(Type *Ty) {
+ if (auto *VectorTy = dyn_cast<FixedVectorType>(Ty))
+ return VectorTy->getElementType();
+ if (auto *ArrayTy = dyn_cast<ArrayType>(Ty))
+ return getHomogeneousType(ArrayTy->getElementType());
+ if (auto *StructTy = dyn_cast<StructType>(Ty)) {
+ if (StructTy->getNumElements() == 0)
+ return nullptr;
+
+ auto *Iter = StructTy->element_begin();
+ Type *HTy = getHomogeneousType(*Iter);
+ for (; Iter != StructTy->element_end(); ++Iter)
+ if (getHomogeneousType(*Iter) != HTy)
+ return nullptr;
+
+ return HTy;
+ }
+ return Ty;
+}
+
// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -828,42 +850,43 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
}
Type *AllocaTy = Alloca.getAllocatedType();
- auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
- if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
- uint64_t NumElems = 1;
- Type *ElemTy;
- do {
- NumElems *= ArrayTy->getNumElements();
- ElemTy = ArrayTy->getElementType();
- } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
-
- // Check for array of vectors
- auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
- if (InnerVectorTy) {
- NumElems *= InnerVectorTy->getNumElements();
- ElemTy = InnerVectorTy->getElementType();
- }
+ Type *ElemTy = getHomogeneousType(AllocaTy);
- if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
- unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
- if (ElementSize > 0) {
- unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
- // Expand vector if required to match padding of inner type,
- // i.e. odd size subvectors.
- // Storage size of new vector must match that of alloca for correct
- // behaviour of byte offsets and GEP computation.
- if (NumElems * ElementSize != AllocaSize)
- NumElems = AllocaSize / ElementSize;
- if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
- VectorTy = FixedVectorType::get(ElemTy, NumElems);
- }
- }
+ if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
+ LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
+ return false;
}
- if (!VectorTy) {
- LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
+ unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
+ if (ElementSizeInBits == 0) {
+ LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements.");
+ return false;
+ }
+ if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
+ LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
+ "does not match the type's size\n");
return false;
}
+ unsigned ElementSize = ElementSizeInBits / 8;
+ if (ElementSize == 0)
+ return false;
+
+ // Calculate the size of the corresponding vector, accounting for padding of
+ // inner types, e.g., odd-sized subvectors. Storage size of new vector must
+ // match that of alloca for correct behaviour of byte offsets and GEP
+ // computation.
+ unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+ unsigned NumElems = AllocaSize / ElementSize;
+ if (NumElems == 0) {
+ LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type.");
+ return false;
+ }
+ if (NumElems * ElementSize != AllocaSize) {
+ LLVM_DEBUG(dbgs() << " Cannot convert type into vector of the same size.");
+ return false;
+ }
+ auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
+ assert(VectorTy && "Failed to create vector type.");
const unsigned MaxElements =
(MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
@@ -895,15 +918,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
- Type *VecEltTy = VectorTy->getElementType();
- unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
- if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
- LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
- "does not match the type's size\n");
- return false;
- }
- unsigned ElementSize = ElementSizeInBits / 8;
- assert(ElementSize > 0);
for (auto *U : Uses) {
Instruction *Inst = cast<Instruction>(U->getUser());
@@ -943,7 +957,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
+ Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
new file mode 100644
index 0000000000000..d09f6ba1e7b68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 %s | FileCheck %s
+
+declare void @clobber_i8(i8)
+
+define void @test_v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca <4 x i8>, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [4 x i8], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2v3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2v3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2a3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2a3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s1v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s1v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s1a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s1a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {[4 x i8]}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v2i8v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v2i8v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v2i8v3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v2i8v3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2s2i8s4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2s2i8s4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2s2i8s3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2s2i8s3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <5 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s3i8s1i8v2i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s3i8s1i8v2i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+; heterogeneous element types are not supported
+define void @test_heterogeneous(i64 %idx) {
+; CHECK-LABEL: define void @test_heterogeneous(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, i8, i16}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+; empty structs are not supported
+define void @test_empty(i64 %idx) {
+; CHECK-LABEL: define void @test_empty(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, {}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
; CHECK-NEXT: ret void
;
%stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
I thought SROA already tried to flatten aggregates out into simple arrays. Why do we need to do this? We don't need to optimally handle all IR, just post-optimized IR.
This behaviour is actually already implemented in the AMDGPUPromoteAllocaToVector pass. I put the test in to ensure this behaviour didn't change with this PR.
If I understand correctly, the problem with SROA is that it tends to prefer promoting types to scalar registers, rather than vectors, which is why the AMDGPUPromoteAllocaToVector pass is queued to occur before SROA in the pipeline.
If SROA can break the value up, that's better; we should be running this after SROA.
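As a side note for anyone wanting to experiment with the ordering question locally (a hedged sketch, not part of the PR): the two orderings can be compared by running the passes explicitly with opt on the new test file, reusing the triple and pass name from its RUN line. Here `sroa` is the standard new-pass-manager name for SROA, and the input filename is just a local copy of the test.

opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector,sroa promote-alloca-structs.ll
opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca-to-vector promote-alloca-structs.ll

The first command mirrors the current pipeline order described above (vector promotion before SROA); the second approximates the ordering suggested in this comment.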
LLVM Buildbot has detected a new failure on one of its builders. Full details are available at: https://lab.llvm.org/buildbot/#/builders/157/builds/30754
Here is the relevant piece of the build log for reference:
Extends the `amdgpu-promote-alloca-to-vector` pass to also promote aggregate types whose elements are all the same type to vector registers. The motivation for this extension was to account for IR generated by the frontend containing several singleton struct types containing vectors or vector-like elements, though the implementation is strictly more general.
Revert "[AMDGPU] Extended vector promotion to aggregate types." (#144366): Reverts #143784. Patch fails some internal tests. Will investigate more thoroughly before attempting to remerge.