Skip to content

Commit fbb1fb0

Browse files
authored
[SYCL] Take into account UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY error code in Program Manager (#15335)
Currently, if the program manager encounters one of the errors UR_RESULT_ERROR_OUT_OF_RESOURCES or UR_RESULT_ERROR_OUT_OF_HOST_MEMORY while building or linking a program, it clears the cache and makes another attempt. This PR makes the following changes: * Additionally take into account the UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY error, which is also possible alongside the aforementioned error codes. * Parameterize the existing unit test by error code (which avoids excessive code duplication) and add the UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY value to the set of tested error codes.
1 parent 114236f commit fbb1fb0

File tree

3 files changed

+22
-127
lines changed

3 files changed

+22
-127
lines changed

sycl/source/detail/kernel_program_cache.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,8 @@ class KernelProgramCache {
329329
BuildResult->Error.Code = detail::get_ur_error(Ex);
330330
if (Ex.code() == errc::memory_allocation ||
331331
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
332-
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
332+
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
333+
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
333334
reset();
334335
BuildResult->updateAndNotify(BuildState::BS_Initial);
335336
continue;

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,7 +1605,8 @@ ProgramManager::ProgramPtr ProgramManager::build(
16051605
};
16061606
ur_result_t Error = doLink();
16071607
if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
1608-
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
1608+
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
1609+
Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
16091610
Context->getKernelProgramCache().reset();
16101611
Error = doLink();
16111612
}
@@ -2427,7 +2428,8 @@ ProgramManager::link(const device_image_plain &DeviceImage,
24272428
};
24282429
ur_result_t Error = doLink();
24292430
if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
2430-
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
2431+
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
2432+
Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
24312433
ContextImpl->getKernelProgramCache().reset();
24322434
Error = doLink();
24332435
}

sycl/unittests/kernel-and-program/OutOfResources.cpp

Lines changed: 16 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -35,28 +35,24 @@ static sycl::unittest::UrImageArray<2> ImgArray{Img};
3535

3636
static int nProgramCreate = 0;
3737
static volatile bool outOfResourcesToggle = false;
38-
static volatile bool outOfHostMemoryToggle = false;
38+
static volatile ur_result_t ErrorCode = UR_RESULT_SUCCESS;
3939

4040
static ur_result_t redefinedProgramCreateWithIL(void *) {
4141
++nProgramCreate;
4242
if (outOfResourcesToggle) {
4343
outOfResourcesToggle = false;
44-
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
44+
return ErrorCode;
4545
}
4646
return UR_RESULT_SUCCESS;
4747
}
4848

49-
static ur_result_t redefinedProgramCreateWithILOutOfHostMemory(void *) {
50-
++nProgramCreate;
51-
if (outOfHostMemoryToggle) {
52-
outOfHostMemoryToggle = false;
53-
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
54-
}
55-
return UR_RESULT_SUCCESS;
56-
}
49+
// Parameterized test fixture
50+
class OutOfResourcesTestSuite : public ::testing::TestWithParam<ur_result_t> {};
5751

58-
TEST(OutOfResourcesTest, urProgramCreate) {
52+
TEST_P(OutOfResourcesTestSuite, urProgramCreate) {
53+
nProgramCreate = 0;
5954
sycl::unittest::UrMock<> Mock;
55+
ErrorCode = GetParam();
6056
mock::getCallbacks().set_before_callback("urProgramCreateWithIL",
6157
&redefinedProgramCreateWithIL);
6258

@@ -116,92 +112,21 @@ TEST(OutOfResourcesTest, urProgramCreate) {
116112
}
117113
}
118114

119-
TEST(OutOfHostMemoryTest, urProgramCreate) {
120-
// Reset to zero.
121-
nProgramCreate = 0;
122-
123-
sycl::unittest::UrMock<> Mock;
124-
mock::getCallbacks().set_before_callback(
125-
"urProgramCreateWithIL", &redefinedProgramCreateWithILOutOfHostMemory);
126-
127-
sycl::platform Plt{sycl::platform()};
128-
sycl::context Ctx{Plt};
129-
auto CtxImpl = detail::getSyclObjImpl(Ctx);
130-
queue q(Ctx, default_selector_v);
131-
132-
int runningTotal = 0;
133-
// Cache is empty, so one urProgramCreateWithIL call.
134-
q.single_task<class OutOfResourcesKernel1>([] {});
135-
EXPECT_EQ(nProgramCreate, runningTotal += 1);
136-
137-
// Now, we make the next urProgramCreateWithIL call fail with
138-
// UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. The caching mechanism should catch
139-
// this, clear the cache, and retry the urProgramCreateWithIL.
140-
outOfHostMemoryToggle = true;
141-
q.single_task<class OutOfResourcesKernel2>([] {});
142-
EXPECT_FALSE(outOfHostMemoryToggle);
143-
EXPECT_EQ(nProgramCreate, runningTotal += 2);
144-
{
145-
detail::KernelProgramCache::ProgramCache &Cache =
146-
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
147-
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
148-
}
149-
150-
// The next urProgramCreateWithIL call will fail with
151-
// UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. But OutOfResourcesKernel2 is in the
152-
// cache, so we expect no new urProgramCreateWithIL calls.
153-
outOfHostMemoryToggle = true;
154-
q.single_task<class OutOfResourcesKernel2>([] {});
155-
EXPECT_TRUE(outOfHostMemoryToggle);
156-
EXPECT_EQ(nProgramCreate, runningTotal);
157-
158-
// OutOfResourcesKernel1 is not in the cache, so we have to
159-
// build it. From what we set before, this call will fail,
160-
// the cache will clear out, and will try again.
161-
q.single_task<class OutOfResourcesKernel1>([] {});
162-
EXPECT_FALSE(outOfHostMemoryToggle);
163-
EXPECT_EQ(nProgramCreate, runningTotal += 2);
164-
{
165-
detail::KernelProgramCache::ProgramCache &Cache =
166-
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
167-
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
168-
}
169-
170-
// Finally, OutOfResourcesKernel1 will be in the cache, but
171-
// OutOfResourceKenel2 will not, so one more urProgramCreateWithIL.
172-
// Toggle is not set, so this should succeed.
173-
q.single_task<class OutOfResourcesKernel1>([] {});
174-
q.single_task<class OutOfResourcesKernel2>([] {});
175-
EXPECT_EQ(nProgramCreate, runningTotal += 1);
176-
{
177-
detail::KernelProgramCache::ProgramCache &Cache =
178-
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
179-
EXPECT_EQ(Cache.size(), 2U) << "Expected 2 program in the cache";
180-
}
181-
}
182-
183115
static int nProgramLink = 0;
184116

185117
static ur_result_t redefinedProgramLink(void *) {
186118
++nProgramLink;
187119
if (outOfResourcesToggle) {
188120
outOfResourcesToggle = false;
189-
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
190-
}
191-
return UR_RESULT_SUCCESS;
192-
}
193-
194-
static ur_result_t redefinedProgramLinkOutOfHostMemory(void *) {
195-
++nProgramLink;
196-
if (outOfHostMemoryToggle) {
197-
outOfHostMemoryToggle = false;
198-
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
121+
return ErrorCode;
199122
}
200123
return UR_RESULT_SUCCESS;
201124
}
202125

203-
TEST(OutOfResourcesTest, urProgramLink) {
126+
TEST_P(OutOfResourcesTestSuite, urProgramLink) {
127+
nProgramLink = 0;
204128
sycl::unittest::UrMock<> Mock;
129+
ErrorCode = GetParam();
205130
mock::getCallbacks().set_before_callback("urProgramLinkExp",
206131
&redefinedProgramLink);
207132

@@ -236,41 +161,8 @@ TEST(OutOfResourcesTest, urProgramLink) {
236161
}
237162
}
238163

239-
TEST(OutOfHostMemoryTest, urProgramLink) {
240-
// Reset to zero.
241-
nProgramLink = 0;
242-
243-
sycl::unittest::UrMock<> Mock;
244-
mock::getCallbacks().set_before_callback(
245-
"urProgramLinkExp", &redefinedProgramLinkOutOfHostMemory);
246-
247-
sycl::platform Plt{sycl::platform()};
248-
sycl::context Ctx{Plt};
249-
auto CtxImpl = detail::getSyclObjImpl(Ctx);
250-
queue q(Ctx, default_selector_v);
251-
// Put some programs in the cache
252-
q.single_task<class OutOfResourcesKernel1>([] {});
253-
q.single_task<class OutOfResourcesKernel2>([] {});
254-
{
255-
detail::KernelProgramCache::ProgramCache &Cache =
256-
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
257-
EXPECT_EQ(Cache.size(), 2U) << "Expect 2 programs in the cache";
258-
}
259-
260-
auto b1 = sycl::get_kernel_bundle<OutOfResourcesKernel1,
261-
sycl::bundle_state::object>(Ctx);
262-
auto b2 = sycl::get_kernel_bundle<OutOfResourcesKernel2,
263-
sycl::bundle_state::object>(Ctx);
264-
outOfHostMemoryToggle = true;
265-
EXPECT_EQ(nProgramLink, 0);
266-
auto b3 = sycl::link({b1, b2});
267-
EXPECT_FALSE(outOfHostMemoryToggle);
268-
// one restart due to out of resources, one link per each of b1 and b2.
269-
EXPECT_EQ(nProgramLink, 3);
270-
// no programs should be in the cache due to out of resources.
271-
{
272-
detail::KernelProgramCache::ProgramCache &Cache =
273-
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
274-
EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
275-
}
276-
}
164+
INSTANTIATE_TEST_SUITE_P(
165+
OutOfResourcesParameterizedRun, OutOfResourcesTestSuite,
166+
::testing::Values(UR_RESULT_ERROR_OUT_OF_RESOURCES,
167+
UR_RESULT_ERROR_OUT_OF_HOST_MEMORY,
168+
UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY));

0 commit comments

Comments
 (0)