Skip to content

Commit 2f470b0

Browse files
committed
[AMDGPU] Handle addrspacecast of vector of ptrs.
1 parent 65c0bf4 commit 2f470b0

File tree

2 files changed

+134
-19
lines changed

2 files changed

+134
-19
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ class AMDGPUSwLowerLDS {
192192
void getLDSMemoryInstructions(Function *Func,
193193
SetVector<Instruction *> &LDSInstructions);
194194
void replaceKernelLDSAccesses(Function *Func);
195-
Value *getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr,
195+
Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
196196
Value *LDSPtr);
197197
void translateLDSMemoryOperationsToGlobalMemory(
198198
Function *Func, Value *LoadMallocPtr,
@@ -655,9 +655,9 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
655655
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
656656
if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
657657
LDSInstructions.insert(&Inst);
658-
} else if (AddrSpaceCastInst *AscI = dyn_cast<AddrSpaceCastInst>(&Inst)) {
659-
if ((AscI->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) &&
660-
(AscI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS))
658+
} else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) {
659+
if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
660+
ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
661661
LDSInstructions.insert(&Inst);
662662
} else
663663
continue;
@@ -666,13 +666,28 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
666666
}
667667

668668
Value *
669-
AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr,
669+
AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
670670
Value *LDSPtr) {
671671
assert(LDSPtr && "Invalid LDS pointer operand");
672+
Type *LDSPtrType = LDSPtr->getType();
673+
674+
if (LDSPtrType->isVectorTy()) {
675+
// Handle vector of pointers
676+
VectorType *VecPtrTy = cast<VectorType>(LDSPtrType);
677+
ElementCount NumElements = VecPtrTy->getElementCount();
678+
Type *Int32VecTy = VectorType::get(IRB.getInt32Ty(), NumElements);
679+
Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, Int32VecTy);
680+
Type *GlobalPtrVecTy = VectorType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumElements);
681+
Value *GlobalPtrVec = PoisonValue::get(GlobalPtrVecTy);
682+
for (uint64_t Index = 0; Index < NumElements.getKnownMinValue(); ++Index) {
683+
Value *ExtElem = IRB.CreateExtractElement(PtrToInt, Index);
684+
Value *Gep = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {ExtElem});
685+
GlobalPtrVec = IRB.CreateInsertElement(GlobalPtrVec, Gep, Index);
686+
}
687+
return GlobalPtrVec;
688+
}
672689
Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty());
673-
Value *GEP =
674-
IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt});
675-
return GEP;
690+
return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt});
676691
}
677692

678693
void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
@@ -685,7 +700,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
685700
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
686701
Value *LIOperand = LI->getPointerOperand();
687702
Value *Replacement =
688-
getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand);
703+
getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand);
689704
LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
690705
LI->getAlign(), LI->isVolatile());
691706
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
@@ -695,7 +710,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
695710
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
696711
Value *SIOperand = SI->getPointerOperand();
697712
Value *Replacement =
698-
getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand);
713+
getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand);
699714
StoreInst *NewSI = IRB.CreateAlignedStore(
700715
SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
701716
NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
@@ -705,7 +720,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
705720
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
706721
Value *RMWPtrOperand = RMW->getPointerOperand();
707722
Value *RMWValOperand = RMW->getValOperand();
708-
Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
723+
Value *Replacement = getTranslatedGlobalMemoryPtrOfLDS(
709724
LoadMallocPtr, RMWPtrOperand);
710725
AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
711726
RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
@@ -716,7 +731,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
716731
RMW->eraseFromParent();
717732
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
718733
Value *XCHGPtrOperand = XCHG->getPointerOperand();
719-
Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
734+
Value *Replacement = getTranslatedGlobalMemoryPtrOfLDS(
720735
LoadMallocPtr, XCHGPtrOperand);
721736
AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
722737
Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
@@ -726,16 +741,16 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
726741
AsanInfo.Instructions.insert(NewXCHG);
727742
XCHG->replaceAllUsesWith(NewXCHG);
728743
XCHG->eraseFromParent();
729-
} else if (AddrSpaceCastInst *AscI = dyn_cast<AddrSpaceCastInst>(Inst)) {
730-
Value *AIOperand = AscI->getPointerOperand();
731-
Value *Gep =
732-
getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, AIOperand);
733-
Value *NewAI = IRB.CreateAddrSpaceCast(Gep, AscI->getType());
744+
} else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) {
745+
Value *AIOperand = ASC->getPointerOperand();
746+
Value *Replacement =
747+
getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand);
748+
Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType());
734749
// Note: No need to add the instruction to AsanInfo instructions to be
735750
// instrumented list. FLAT_ADDRESS ptr would have been already
736751
// instrumented by asan pass prior to this pass.
737-
AscI->replaceAllUsesWith(NewAI);
738-
AscI->eraseFromParent();
752+
ASC->replaceAllUsesWith(NewAI);
753+
ASC->eraseFromParent();
739754
} else
740755
report_fatal_error("Unimplemented LDS lowering instruction");
741756
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
2+
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
3+
4+
; Test to check that accesses through a vector of pointers to static LDS in a kernel are lowered correctly.
5+
@lds_var1 = internal addrspace(3) global i32 poison
6+
@lds_var2 = internal addrspace(3) global i32 poison
7+
8+
;.
9+
; CHECK: @llvm.amdgcn.sw.lds.example = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]]
10+
; CHECK: @llvm.amdgcn.sw.lds.example.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.example.md.type { %llvm.amdgcn.sw.lds.example.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address
11+
;.
12+
define amdgpu_kernel void @example() sanitize_address {
13+
; CHECK-LABEL: define amdgpu_kernel void @example(
14+
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
15+
; CHECK-NEXT: [[WID:.*]]:
16+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
17+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
18+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
19+
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
20+
; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
21+
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
22+
; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[ENTRY:.*]]
23+
; CHECK: [[MALLOC]]:
24+
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4
25+
; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 2), align 4
26+
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]]
27+
; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
28+
; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0)
29+
; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
30+
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]])
31+
; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1)
32+
; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8
33+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8
34+
; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64
35+
; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24)
36+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 36
37+
; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64
38+
; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 28)
39+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68
40+
; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64
41+
; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28)
42+
; CHECK-NEXT: br label %[[ENTRY]]
43+
; CHECK: [[ENTRY]]:
44+
; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ]
45+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
46+
; CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8
47+
; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 1, i32 0), align 4
48+
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP21]]
49+
; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4
50+
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP23]]
51+
; CHECK-NEXT: [[VEC_LDS_PTRS:%.*]] = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) [[TMP22]], i32 0
52+
; CHECK-NEXT: [[VEC_LDS_PTRS1:%.*]] = insertelement <2 x ptr addrspace(3)> [[VEC_LDS_PTRS]], ptr addrspace(3) [[TMP24]], i32 1
53+
; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VEC_LDS_PTRS1]] to <2 x i32>
54+
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[TMP25]], i64 0
55+
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 [[TMP26]]
56+
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP27]], i64 0
57+
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP25]], i64 1
58+
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 [[TMP29]]
59+
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x ptr addrspace(1)> [[TMP28]], ptr addrspace(1) [[TMP30]], i64 1
60+
; CHECK-NEXT: [[TMP32:%.*]] = addrspacecast <2 x ptr addrspace(1)> [[TMP31]] to <2 x ptr>
61+
; CHECK-NEXT: [[ELEM0:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 0
62+
; CHECK-NEXT: store i32 42, ptr [[ELEM0]], align 4
63+
; CHECK-NEXT: [[ELEM1:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 1
64+
; CHECK-NEXT: store i32 43, ptr [[ELEM1]], align 4
65+
; CHECK-NEXT: br label %[[CONDFREE:.*]]
66+
; CHECK: [[CONDFREE]]:
67+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
68+
; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]]
69+
; CHECK: [[FREE]]:
70+
; CHECK-NEXT: [[TMP33:%.*]] = call ptr @llvm.returnaddress(i32 0)
71+
; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr [[TMP33]] to i64
72+
; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64
73+
; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP35]], i64 [[TMP34]])
74+
; CHECK-NEXT: br label %[[END]]
75+
; CHECK: [[END]]:
76+
; CHECK-NEXT: ret void
77+
;
78+
entry:
79+
; Create a vector of LDS pointers, then addrspacecast it to a vector of flat pointers.
80+
%vec_lds_ptrs = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) @lds_var1, i32 0
81+
%vec_lds_ptrs1 = insertelement <2 x ptr addrspace(3)> %vec_lds_ptrs, ptr addrspace(3) @lds_var2, i32 1
82+
%vec_flat_ptrs = addrspacecast <2 x ptr addrspace(3)> %vec_lds_ptrs1 to <2 x ptr>
83+
%elem0 = extractelement <2 x ptr> %vec_flat_ptrs, i32 0
84+
store i32 42, ptr %elem0, align 4
85+
%elem1 = extractelement <2 x ptr> %vec_flat_ptrs, i32 1
86+
store i32 43, ptr %elem1, align 4
87+
ret void
88+
}
89+
90+
!llvm.module.flags = !{!0}
91+
!0 = !{i32 4, !"nosanitize_address", i32 1}
92+
;.
93+
; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" }
94+
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
95+
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
96+
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
97+
;.
98+
; CHECK: [[META0]] = !{i32 0, i32 1}
99+
; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1}
100+
;.

0 commit comments

Comments
 (0)