
Commit 31ba4df

zhaomaosu authored and kbenzie committed
[DevASAN] Do allocation with USM pool to reduce memory overhead
Releasing mapped physical memory according to its allocation dependencies may cause problems, so we decided to allocate from a USM pool instead to reduce memory overhead.
1 parent cc7e8a1 commit 31ba4df
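The idea, in isolation: allocations that arrive without an explicit pool are routed through one lazily created, context-wide USM pool, so the pool can recycle freed blocks instead of every allocation mapping and holding its own physical memory. Below is a minimal illustrative sketch using the public UR entry points that correspond to the pfnPoolCreate/pfnDeviceAlloc calls in the diff; PooledDeviceAlloc and its parameters are hypothetical names for illustration, not part of this commit.

#include <ur_api.h>

// Sketch: allocate device USM through a lazily created, context-wide pool.
// A null pool handle is accepted by urUSMDeviceAlloc and simply bypasses pooling.
void *PooledDeviceAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev,
                        size_t Size, ur_usm_pool_handle_t &Pool) {
    if (!Pool) {
        // Create one pool per context and reuse it for later allocations.
        ur_usm_pool_desc_t Desc{UR_STRUCTURE_TYPE_USM_POOL_DESC, nullptr, 0};
        if (urUSMPoolCreate(Ctx, &Desc, &Pool) != UR_RESULT_SUCCESS) {
            Pool = nullptr; // pooling unsupported: fall back to plain USM
        }
    }
    void *Mem = nullptr;
    urUSMDeviceAlloc(Ctx, Dev, /*pUSMDesc=*/nullptr, Pool, Size, &Mem);
    return Mem;
}

In the layer itself, the pool lives on ContextInfo, is created under std::call_once, and is released in ~ContextInfo, as the asan_interceptor diff below shows.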

File tree

4 files changed: 64 additions & 80 deletions


source/loader/layers/sanitizer/asan/asan_interceptor.cpp

Lines changed: 30 additions & 24 deletions
@@ -36,7 +36,8 @@ AsanInterceptor::~AsanInterceptor() {
     // We must release these objects before releasing adapters, since
     // they may use the adapter in their destructor
     for (const auto &[_, DeviceInfo] : m_DeviceMap) {
-        DeviceInfo->Shadow->Destory();
+        [[maybe_unused]] auto URes = DeviceInfo->Shadow->Destory();
+        assert(URes == UR_RESULT_SUCCESS);
     }

     m_Quarantine = nullptr;
@@ -96,6 +97,10 @@ ur_result_t AsanInterceptor::allocateMemory(ur_context_handle_t Context,

     void *Allocated = nullptr;

+    if (Pool == nullptr) {
+        Pool = ContextInfo->getUSMPool();
+    }
+
     if (Type == AllocType::DEVICE_USM) {
         UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc(
             Context, Device, Properties, Pool, NeededSize, &Allocated));
@@ -228,16 +233,6 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context,
     ContextInfo->Stats.UpdateUSMRealFreed(
         ToFreeAllocInfo->AllocSize, ToFreeAllocInfo->getRedzoneSize());

-    if (ToFreeAllocInfo->Type == AllocType::HOST_USM) {
-        for (auto &Device : ContextInfo->DeviceList) {
-            UR_CALL(getDeviceInfo(Device)->Shadow->ReleaseShadow(
-                ToFreeAllocInfo));
-        }
-    } else {
-        UR_CALL(getDeviceInfo(ToFreeAllocInfo->Device)
-                    ->Shadow->ReleaseShadow(ToFreeAllocInfo));
-    }
-
     UR_CALL(getContext()->urDdiTable.USM.pfnFree(
         Context, (void *)(ToFreeAllocInfo->AllocBegin)));

@@ -436,12 +431,6 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) {
     auto ProgramInfo = getProgramInfo(Program);
     assert(ProgramInfo != nullptr && "unregistered program!");

-    for (auto AI : ProgramInfo->AllocInfoForGlobals) {
-        UR_CALL(getDeviceInfo(AI->Device)->Shadow->ReleaseShadow(AI));
-        m_AllocationMap.erase(AI->AllocBegin);
-    }
-    ProgramInfo->AllocInfoForGlobals.clear();
-
     ProgramInfo->InstrumentedKernels.clear();

     return UR_RESULT_SUCCESS;
@@ -560,10 +549,6 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) {
                 {}});

             ContextInfo->insertAllocInfo({Device}, AI);
-            ProgramInfo->AllocInfoForGlobals.emplace(AI);
-
-            std::scoped_lock<ur_shared_mutex> Guard(m_AllocationMapMutex);
-            m_AllocationMap.emplace(AI->AllocBegin, std::move(AI));
         }
     }

@@ -887,9 +872,14 @@ bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const {
 ContextInfo::~ContextInfo() {
     Stats.Print(Handle);

-    [[maybe_unused]] auto Result =
-        getContext()->urDdiTable.Context.pfnRelease(Handle);
-    assert(Result == UR_RESULT_SUCCESS);
+    [[maybe_unused]] ur_result_t URes;
+    if (USMPool) {
+        URes = getContext()->urDdiTable.USM.pfnPoolRelease(USMPool);
+        assert(URes == UR_RESULT_SUCCESS);
+    }
+
+    URes = getContext()->urDdiTable.Context.pfnRelease(Handle);
+    assert(URes == UR_RESULT_SUCCESS);

     // check memory leaks
     if (getAsanInterceptor()->getOptions().DetectLeaks &&
@@ -905,6 +895,22 @@ ContextInfo::~ContextInfo() {
         }
     }

+ur_usm_pool_handle_t ContextInfo::getUSMPool() {
+    std::call_once(PoolInit, [this]() {
+        ur_usm_pool_desc_t Desc{UR_STRUCTURE_TYPE_USM_POOL_DESC, nullptr, 0};
+        auto URes =
+            getContext()->urDdiTable.USM.pfnPoolCreate(Handle, &Desc, &USMPool);
+        if (URes != UR_RESULT_SUCCESS &&
+            URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+            getContext()->logger.warning(
+                "Failed to create USM pool, the memory overhead "
+                "may increase: {}",
+                URes);
+        }
+    });
+    return USMPool;
+}
+
 AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() {
     [[maybe_unused]] ur_result_t Result;
     if (Host.LocalArgs) {

source/loader/layers/sanitizer/asan/asan_interceptor.hpp

Lines changed: 6 additions & 1 deletion
@@ -112,7 +112,6 @@ struct ProgramInfo {
     std::atomic<int32_t> RefCount = 1;

     // Program is built only once, so we don't need to lock it
-    std::unordered_set<std::shared_ptr<AllocInfo>> AllocInfoForGlobals;
     std::unordered_set<std::string> InstrumentedKernels;

     explicit ProgramInfo(ur_program_handle_t Program) : Handle(Program) {
@@ -132,6 +131,10 @@ struct ProgramInfo {

 struct ContextInfo {
     ur_context_handle_t Handle;
+
+    ur_usm_pool_handle_t USMPool{};
+    std::once_flag PoolInit;
+
     std::atomic<int32_t> RefCount = 1;

     std::vector<ur_device_handle_t> DeviceList;
@@ -155,6 +158,8 @@ struct ContextInfo {
             AllocInfos.List.emplace_back(AI);
         }
     }
+
+    ur_usm_pool_handle_t getUSMPool();
 };

 struct AsanRuntimeDataWrapper {

source/loader/layers/sanitizer/asan/asan_shadow.cpp

Lines changed: 27 additions & 45 deletions
@@ -108,11 +108,15 @@ ur_result_t ShadowMemoryGPU::Setup() {
     // TODO: Protect Bad Zone
     auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve(
         Context, nullptr, ShadowSize, (void **)&ShadowBegin);
-    if (Result == UR_RESULT_SUCCESS) {
-        ShadowEnd = ShadowBegin + ShadowSize;
-        // Retain the context which reserves shadow memory
-        getContext()->urDdiTable.Context.pfnRetain(Context);
+    if (Result != UR_RESULT_SUCCESS) {
+        getContext()->logger.error(
+            "Shadow memory reserved failed with size {}: {}",
+            (void *)ShadowSize, Result);
+        return Result;
     }
+    ShadowEnd = ShadowBegin + ShadowSize;
+    // Retain the context which reserves shadow memory
+    getContext()->urDdiTable.Context.pfnRetain(Context);

     // Set shadow memory for null pointer
     // For GPU, wu use up to 1 page of shadow memory
@@ -137,6 +141,24 @@ ur_result_t ShadowMemoryGPU::Destory() {
             Context, (void *)PrivateShadowOffset));
         PrivateShadowOffset = 0;
     }
+
+    static ur_result_t Result = [this]() {
+        const size_t PageSize = GetVirtualMemGranularity(Context, Device);
+        for (auto [MappedPtr, PhysicalMem] : VirtualMemMaps) {
+            UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap(
+                Context, (void *)MappedPtr, PageSize));
+            UR_CALL(
+                getContext()->urDdiTable.PhysicalMem.pfnRelease(PhysicalMem));
+        }
+        UR_CALL(getContext()->urDdiTable.VirtualMem.pfnFree(
+            Context, (const void *)ShadowBegin, GetShadowSize()));
+        UR_CALL(getContext()->urDdiTable.Context.pfnRelease(Context));
+        return UR_RESULT_SUCCESS;
+    }();
+    if (!Result) {
+        return Result;
+    }
+
     if (LocalShadowOffset != 0) {
         UR_CALL(getContext()->urDdiTable.USM.pfnFree(
             Context, (void *)LocalShadowOffset));
@@ -205,19 +227,8 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue,
                 return URes;
             }

-            VirtualMemMaps[MappedPtr].first = PhysicalMem;
+            VirtualMemMaps[MappedPtr] = PhysicalMem;
         }
-
-        // We don't need to record virtual memory map for null pointer,
-        // since it doesn't have an alloc info.
-        if (Ptr == 0) {
-            continue;
-        }
-
-        auto AllocInfoIt =
-            getAsanInterceptor()->findAllocInfoByAddress(Ptr);
-        assert(AllocInfoIt);
-        VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second);
         }
     }

@@ -235,35 +246,6 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue,
     return UR_RESULT_SUCCESS;
 }

-ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr<AllocInfo> AI) {
-    uptr ShadowBegin = MemToShadow(AI->AllocBegin);
-    uptr ShadowEnd = MemToShadow(AI->AllocBegin + AI->AllocSize);
-    assert(ShadowBegin <= ShadowEnd);
-
-    static const size_t PageSize = GetVirtualMemGranularity(Context, Device);
-
-    for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize);
-         MappedPtr <= ShadowEnd; MappedPtr += PageSize) {
-        std::scoped_lock<ur_mutex> Guard(VirtualMemMapsMutex);
-        if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) {
-            continue;
-        }
-        VirtualMemMaps[MappedPtr].second.erase(AI);
-        if (VirtualMemMaps[MappedPtr].second.empty()) {
-            UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap(
-                Context, (void *)MappedPtr, PageSize));
-            UR_CALL(getContext()->urDdiTable.PhysicalMem.pfnRelease(
-                VirtualMemMaps[MappedPtr].first));
-            getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}",
-                                       (void *)MappedPtr,
-                                       (void *)(MappedPtr + PageSize - 1));
-            VirtualMemMaps.erase(MappedPtr);
-        }
-    }
-
-    return UR_RESULT_SUCCESS;
-}
-
 ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue,
                                               uint32_t NumWG, uptr &Begin,
                                               uptr &End) {

source/loader/layers/sanitizer/asan/asan_shadow.hpp

Lines changed: 1 addition & 10 deletions
@@ -35,10 +35,6 @@ struct ShadowMemory {
     virtual ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr,
                                             uptr Size, u8 Value) = 0;

-    virtual ur_result_t ReleaseShadow(std::shared_ptr<AllocInfo>) {
-        return UR_RESULT_SUCCESS;
-    }
-
     virtual size_t GetShadowSize() = 0;

     virtual ur_result_t AllocLocalShadow(ur_queue_handle_t Queue,
@@ -98,8 +94,6 @@ struct ShadowMemoryGPU : public ShadowMemory {
     ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr,
                                     uptr Size, u8 Value) override final;

-    ur_result_t ReleaseShadow(std::shared_ptr<AllocInfo> AI) override final;
-
     ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG,
                                  uptr &Begin, uptr &End) override final;

@@ -108,10 +102,7 @@ struct ShadowMemoryGPU : public ShadowMemory {

     ur_mutex VirtualMemMapsMutex;

-    std::unordered_map<
-        uptr, std::pair<ur_physical_mem_handle_t,
-                        std::unordered_set<std::shared_ptr<AllocInfo>>>>
-        VirtualMemMaps;
+    std::unordered_map<uptr, ur_physical_mem_handle_t> VirtualMemMaps;

     uptr LocalShadowOffset = 0;
