Skip to content

Commit 28b5054

Browse files
authored
[AMDGPU] Fix PromoteAlloca size check of alloca for store (#72528)
When storing a subvector, too many elements were written when the size of the alloca was smaller than the size of the vector store. This patch uses the minimum of the alloca vector's element count and the stored vector's element count to determine the number of elements to store.
1 parent 84ebe5b commit 28b5054

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,7 @@ static Value *promoteAllocaUserToVector(
469469
assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
470470
const unsigned NumWrittenElts =
471471
AccessSize / DL.getTypeStoreSize(VecEltTy);
472+
const unsigned NumVecElts = VectorTy->getNumElements();
472473
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
473474
assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
474475

@@ -480,7 +481,8 @@ static Value *promoteAllocaUserToVector(
480481
Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
481482

482483
Value *CurVec = GetOrLoadCurrentVectorValue();
483-
for (unsigned K = 0; K < NumWrittenElts; ++K) {
484+
for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
485+
K < NumElts; ++K) {
484486
Value *CurIdx =
485487
Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
486488
CurVec = Builder.CreateInsertElement(

llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,3 +458,30 @@ finally:
458458
%load = load <4 x i16>, ptr addrspace(5) %ptr.2, align 2
459459
ret <4 x i16> %load
460460
}
461+
462+
463+
; Check the case when the alloca is smaller than the vector size.
464+
define void @test_smaller_alloca_store(<4 x i32> %store1, <4 x i32> %store2) {
465+
; CHECK-LABEL: define void @test_smaller_alloca_store
466+
; CHECK-SAME: (<4 x i32> [[STORE1:%.*]], <4 x i32> [[STORE2:%.*]]) {
467+
; CHECK-NEXT: entry:
468+
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[STORE1]], i64 0
469+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[TMP0]], i32 0
470+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[STORE1]], i64 1
471+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[TMP2]], i32 1
472+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[STORE1]], i64 2
473+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[TMP4]], i32 2
474+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[STORE2]], i64 0
475+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[TMP6]], i32 0
476+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[STORE2]], i64 1
477+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i32> [[TMP7]], i32 [[TMP8]], i32 1
478+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[STORE2]], i64 2
479+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x i32> [[TMP9]], i32 [[TMP10]], i32 2
480+
; CHECK-NEXT: ret void
481+
;
482+
entry:
483+
%res = alloca <3 x i32>, align 16, addrspace(5)
484+
store <4 x i32> %store1, ptr addrspace(5) %res, align 16
485+
store <4 x i32> %store2, ptr addrspace(5) %res, align 16
486+
ret void
487+
}

0 commit comments

Comments
 (0)