Skip to content

Commit 13a7455

Browse files
authored
[SYCL] Do not build device code for sub-devices (#5240)
Technically sub-devices are the same as their root device, so we can build program for root device only and re-use the binary for sub-devices to avoid "duplicate" builds.
1 parent fbf6e21 commit 13a7455

File tree

13 files changed

+227
-15
lines changed

13 files changed

+227
-15
lines changed

sycl/include/CL/sycl/detail/pi.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,8 @@ typedef enum {
305305
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
306306
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
307307
PI_DEVICE_INFO_IMAGE_SRGB = 0x10027,
308+
// Return true if sub-device should do its own program build
309+
PI_DEVICE_INFO_BUILD_ON_SUBDEVICE = 0x10028,
308310
PI_DEVICE_INFO_ATOMIC_64 = 0x10110,
309311
PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111,
310312
PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1490,6 +1490,10 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
14901490
return getInfo(param_value_size, param_value, param_value_size_ret,
14911491
PI_TRUE);
14921492
}
1493+
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
1494+
return getInfo(param_value_size, param_value, param_value_size_ret,
1495+
PI_TRUE);
1496+
}
14931497
case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
14941498
return getInfo(param_value_size, param_value, param_value_size_ret,
14951499
PI_TRUE);

sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
667667
return ReturnValue("");
668668
case PI_DEVICE_INFO_VERSION:
669669
return ReturnValue(Device->VersionStr.c_str());
670+
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: // emulator doesn't support partition
671+
return ReturnValue(pi_bool{true});
670672
case PI_DEVICE_INFO_COMPILER_AVAILABLE:
671673
return ReturnValue(pi_bool{false});
672674
case PI_DEVICE_INFO_LINKER_AVAILABLE:

sycl/plugins/hip/pi_hip.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1409,6 +1409,10 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
14091409
return getInfo(param_value_size, param_value, param_value_size_ret,
14101410
PI_TRUE);
14111411
}
1412+
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
1413+
return getInfo(param_value_size, param_value, param_value_size_ret,
1414+
PI_TRUE);
1415+
}
14121416
case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
14131417
return getInfo(param_value_size, param_value, param_value_size_ret,
14141418
PI_TRUE);

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2028,7 +2028,7 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
20282028
return PI_INVALID_VALUE;
20292029
}
20302030

2031-
// Get the cahched PI device created for the L0 device handle.
2031+
// Get the cached PI device created for the L0 device handle.
20322032
// Return NULL if no such PI device found.
20332033
pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
20342034

@@ -2188,6 +2188,11 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() {
21882188
// Create PI sub-sub-devices with the sub-device for all the ordinals.
21892189
// Each {ordinal, index} points to a specific CCS which constructs
21902190
// a sub-sub-device at this point.
2191+
// FIXME: Level Zero creates multiple PiDevices for a single physical
2192+
// device when sub-device is partitioned into sub-sub-devices.
2193+
// Sub-sub-device is technically a command queue and we should not build
2194+
// program for each command queue. PiDevice is probably not the right
2195+
// abstraction for a Level Zero command queue.
21912196
for (uint32_t J = 0; J < Ordinals.size(); ++J) {
21922197
for (uint32_t K = 0; K < QueueGroupProperties[Ordinals[J]].numQueues;
21932198
++K) {
@@ -2276,8 +2281,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
22762281
}
22772282
}
22782283
case PI_DEVICE_INFO_PARENT_DEVICE:
2279-
// TODO: all Level Zero devices are parent ?
2280-
return ReturnValue(pi_device{0});
2284+
return ReturnValue(Device->RootDevice);
22812285
case PI_DEVICE_INFO_PLATFORM:
22822286
return ReturnValue(Device->Platform);
22832287
case PI_DEVICE_INFO_VENDOR_ID:
@@ -2337,6 +2341,11 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
23372341
}
23382342
case PI_DEVICE_INFO_NAME:
23392343
return ReturnValue(Device->ZeDeviceProperties->name);
2344+
// zeModuleCreate allows using root device module for sub-devices:
2345+
// > The application must only use the module for the device, or its
2346+
// > sub-devices, which was provided during creation.
2347+
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE:
2348+
return ReturnValue(PI_FALSE);
23402349
case PI_DEVICE_INFO_COMPILER_AVAILABLE:
23412350
return ReturnValue(pi_bool{1});
23422351
case PI_DEVICE_INFO_LINKER_AVAILABLE:

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,17 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
203203
std::memcpy(paramValue, &result, sizeof(cl_bool));
204204
return PI_SUCCESS;
205205
}
206+
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
207+
cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
208+
cl_int res = clGetDeviceInfo(cast<cl_device_id>(device), CL_DEVICE_TYPE,
209+
sizeof(cl_device_type), &devType, nullptr);
206210

211+
// FIXME: here we assume that program built for a root GPU device can be
212+
// used on its sub-devices without re-building
213+
cl_bool result = (res == CL_SUCCESS) && (devType == CL_DEVICE_TYPE_GPU);
214+
std::memcpy(paramValue, &result, sizeof(cl_bool));
215+
return PI_SUCCESS;
216+
}
207217
case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D:
208218
// Returns the maximum sizes of a work group for each dimension one
209219
// could use to submit a kernel. There is no such query defined in OpenCL

sycl/source/detail/device_impl.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,11 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle,
5353
Plugin.call<PiApiKind::piDeviceGetInfo>(
5454
MDevice, PI_DEVICE_INFO_TYPE, sizeof(RT::PiDeviceType), &MType, nullptr);
5555

56-
RT::PiDevice parent = nullptr;
5756
// TODO catch an exception and put it to list of asynchronous exceptions
5857
Plugin.call<PiApiKind::piDeviceGetInfo>(MDevice, PI_DEVICE_INFO_PARENT_DEVICE,
59-
sizeof(RT::PiDevice), &parent,
58+
sizeof(RT::PiDevice), &MRootDevice,
6059
nullptr);
6160

62-
MIsRootDevice = (nullptr == parent);
6361
if (!InteroperabilityConstructor) {
6462
// TODO catch an exception and put it to list of asynchronous exceptions
6563
// Interoperability Constructor already calls DeviceRetain in

sycl/source/detail/device_impl.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,14 +226,16 @@ class device_impl {
226226

227227
bool isAssertFailSupported() const;
228228

229+
bool isRootDevice() const { return MRootDevice == nullptr; }
230+
229231
std::string getDeviceName() const;
230232

231233
private:
232234
explicit device_impl(pi_native_handle InteropDevice, RT::PiDevice Device,
233235
PlatformImplPtr Platform, const plugin &Plugin);
234236
RT::PiDevice MDevice = 0;
235237
RT::PiDeviceType MType;
236-
bool MIsRootDevice = false;
238+
RT::PiDevice MRootDevice = nullptr;
237239
bool MIsHostDevice;
238240
PlatformImplPtr MPlatform;
239241
bool MIsAssertFailSupported = false;

sycl/source/detail/persistent_device_code_cache.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,13 @@ void PersistentDeviceCodeCache::putItemToDisc(
7878
const SerializedObj &SpecConsts, const std::string &BuildOptionsString,
7979
const RT::PiProgram &NativePrg) {
8080

81+
if (!isImageCached(Img))
82+
return;
83+
8184
std::string DirName =
8285
getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString);
8386

84-
if (!isImageCached(Img) || DirName.empty())
87+
if (DirName.empty())
8588
return;
8689

8790
auto Plugin = detail::getSyclObjImpl(Device)->getPlugin();
@@ -137,10 +140,13 @@ std::vector<std::vector<char>> PersistentDeviceCodeCache::getItemFromDisc(
137140
const device &Device, const RTDeviceBinaryImage &Img,
138141
const SerializedObj &SpecConsts, const std::string &BuildOptionsString) {
139142

143+
if (!isImageCached(Img))
144+
return {};
145+
140146
std::string Path =
141147
getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString);
142148

143-
if (!isImageCached(Img) || Path.empty() || !OSUtil::isPathPresent(Path))
149+
if (Path.empty() || !OSUtil::isPathPresent(Path))
144150
return {};
145151

146152
int i = 0;

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -481,10 +481,29 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(
481481
if (Prg)
482482
Prg->stableSerializeSpecConstRegistry(SpecConsts);
483483

484-
auto BuildF = [this, &M, &KSId, &ContextImpl, &DeviceImpl, Prg, &CompileOpts,
484+
// Check if we can optimize program builds for sub-devices by using a program
485+
// built for the root device
486+
DeviceImplPtr RootDevImpl = DeviceImpl;
487+
while (!RootDevImpl->isRootDevice()) {
488+
auto ParentDev = detail::getSyclObjImpl(
489+
RootDevImpl->get_info<info::device::parent_device>());
490+
// Sharing is allowed within a single context only
491+
if (!ContextImpl->hasDevice(ParentDev))
492+
break;
493+
RootDevImpl = ParentDev;
494+
}
495+
496+
pi_bool MustBuildOnSubdevice = PI_TRUE;
497+
ContextImpl->getPlugin().call<PiApiKind::piDeviceGetInfo>(
498+
RootDevImpl->getHandleRef(), PI_DEVICE_INFO_BUILD_ON_SUBDEVICE,
499+
sizeof(pi_bool), &MustBuildOnSubdevice, nullptr);
500+
501+
DeviceImplPtr Dev =
502+
(MustBuildOnSubdevice == PI_TRUE) ? DeviceImpl : RootDevImpl;
503+
auto BuildF = [this, &M, &KSId, &ContextImpl, &Dev, Prg, &CompileOpts,
485504
&LinkOpts, &JITCompilationIsRequired, SpecConsts] {
486505
auto Context = createSyclObjFromImpl<context>(ContextImpl);
487-
auto Device = createSyclObjFromImpl<device>(DeviceImpl);
506+
auto Device = createSyclObjFromImpl<device>(Dev);
488507

489508
const RTDeviceBinaryImage &Img =
490509
getDeviceImage(M, KSId, Context, Device, JITCompilationIsRequired);
@@ -536,7 +555,7 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(
536555
return BuiltProgram.release();
537556
};
538557

539-
const RT::PiDevice PiDevice = DeviceImpl->getHandleRef();
558+
const RT::PiDevice PiDevice = Dev->getHandleRef();
540559

541560
auto BuildResult = getOrBuild<PiProgramT, compile_program_error>(
542561
Cache,

sycl/source/detail/program_manager/program_manager.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ class ProgramManager {
108108
SerializedObj SpecConsts);
109109
/// Builds or retrieves from cache a program defining the kernel with given
110110
/// name.
111-
/// \param M idenfies the OS module the kernel comes from (multiple OS modules
112-
/// may have kernels with the same name)
111+
/// \param M identifies the OS module the kernel comes from (multiple OS
112+
/// modules may have kernels with the same name)
113113
/// \param Context the context to build the program with
114114
/// \param Device the device for which the program is built
115115
/// \param KernelName the kernel's name
@@ -153,7 +153,7 @@ class ProgramManager {
153153
/// \param NativePrg the native program, target for spec constant setting; if
154154
/// not null then overrides the native program in Prg
155155
/// \param Img A source of the information about which constants need
156-
/// setting and symboling->integer spec constnant ID mapping. If not
156+
/// setting and symboling->integer spec constant ID mapping. If not
157157
/// null, overrides native program->binary image binding maintained by
158158
/// the program manager.
159159
void flushSpecConstants(const program_impl &Prg,

sycl/unittests/program_manager/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@ add_sycl_unittest(ProgramManagerTests OBJECT
44
BuildLog.cpp
55
EliminatedArgMask.cpp
66
itt_annotations.cpp
7+
SubDevices.cpp
78
)
89

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <CL/sycl/program.hpp>
10+
#include <detail/kernel_bundle_impl.hpp>
11+
12+
#include <helpers/CommonRedefinitions.hpp>
13+
#include <helpers/PiImage.hpp>
14+
#include <helpers/PiMock.hpp>
15+
16+
#include <gtest/gtest.h>
17+
18+
#include <helpers/TestKernel.hpp>
19+
20+
static pi_device rootDevice;
21+
static pi_device piSubDev1 = (pi_device)0x1;
22+
static pi_device piSubDev2 = (pi_device)0x2;
23+
24+
namespace {
25+
pi_result redefinedDeviceGetInfo(pi_device device, pi_device_info param_name,
26+
size_t param_value_size, void *param_value,
27+
size_t *param_value_size_ret) {
28+
if (param_name == PI_DEVICE_INFO_PARTITION_PROPERTIES) {
29+
if (!param_value) {
30+
*param_value_size_ret = 2 * sizeof(pi_device_partition_property);
31+
} else {
32+
((pi_device_partition_property *)param_value)[0] =
33+
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
34+
((pi_device_partition_property *)param_value)[1] =
35+
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
36+
}
37+
}
38+
if (param_name == PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN) {
39+
if (!param_value) {
40+
*param_value_size_ret = sizeof(pi_device_affinity_domain);
41+
} else {
42+
((pi_device_affinity_domain *)param_value)[0] =
43+
PI_DEVICE_AFFINITY_DOMAIN_NUMA |
44+
PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE;
45+
}
46+
}
47+
if (param_name == PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES) {
48+
((pi_uint32 *)param_value)[0] = 2;
49+
}
50+
if (param_name == PI_DEVICE_INFO_PARENT_DEVICE) {
51+
if (device == piSubDev1 || device == piSubDev2)
52+
((pi_device *)param_value)[0] = rootDevice;
53+
else
54+
((pi_device *)param_value)[0] = nullptr;
55+
}
56+
return PI_SUCCESS;
57+
}
58+
59+
pi_result redefinedDevicePartition(
60+
pi_device Device, const pi_device_partition_property *Properties,
61+
pi_uint32 NumDevices, pi_device *OutDevices, pi_uint32 *OutNumDevices) {
62+
if (OutNumDevices)
63+
*OutNumDevices = 2;
64+
if (OutDevices) {
65+
OutDevices[0] = {};
66+
OutDevices[1] = {};
67+
}
68+
return PI_SUCCESS;
69+
}
70+
71+
pi_result redefinedDeviceRetain(pi_device c) { return PI_SUCCESS; }
72+
73+
pi_result redefinedDeviceRelease(pi_device c) { return PI_SUCCESS; }
74+
75+
pi_result redefinedProgramBuild(
76+
pi_program prog, pi_uint32, const pi_device *, const char *,
77+
void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
78+
static int m = 0;
79+
m++;
80+
// if called more than once return an error
81+
if (m > 1)
82+
return PI_ERROR_UNKNOWN;
83+
84+
return PI_SUCCESS;
85+
}
86+
87+
pi_result redefinedContextCreate(const pi_context_properties *Properties,
88+
pi_uint32 NumDevices, const pi_device *Devices,
89+
void (*PFnNotify)(const char *ErrInfo,
90+
const void *PrivateInfo,
91+
size_t CB, void *UserData),
92+
void *UserData, pi_context *RetContext) {
93+
return PI_SUCCESS;
94+
}
95+
} // anonymous namespace
96+
97+
// Check that program is built once for all sub-devices
98+
// FIXME: mock 3 devices (one root device + two sub-devices) within a single
99+
// context.
100+
TEST(SubDevices, DISABLED_BuildProgramForSubdevices) {
101+
sycl::platform Plt{sycl::default_selector()};
102+
// Host devices do not support sub-devices
103+
if (Plt.is_host() || Plt.get_backend() == sycl::backend::ext_oneapi_cuda ||
104+
Plt.get_backend() == sycl::backend::ext_oneapi_hip) {
105+
std::cerr << "Test is not supported on "
106+
<< Plt.get_info<sycl::info::platform::name>() << ", skipping\n";
107+
GTEST_SKIP(); // test is not supported on selected platform.
108+
}
109+
110+
// Setup Mock APIs
111+
sycl::unittest::PiMock Mock{Plt};
112+
setupDefaultMockAPIs(Mock);
113+
Mock.redefine<sycl::detail::PiApiKind::piDeviceGetInfo>(
114+
redefinedDeviceGetInfo);
115+
Mock.redefine<sycl::detail::PiApiKind::piDevicePartition>(
116+
redefinedDevicePartition);
117+
Mock.redefine<sycl::detail::PiApiKind::piDeviceRetain>(redefinedDeviceRetain);
118+
Mock.redefine<sycl::detail::PiApiKind::piDeviceRelease>(
119+
redefinedDeviceRelease);
120+
Mock.redefine<sycl::detail::PiApiKind::piProgramBuild>(redefinedProgramBuild);
121+
Mock.redefine<sycl::detail::PiApiKind::piContextCreate>(
122+
redefinedContextCreate);
123+
124+
// Create 2 sub-devices and use first platform device as a root device
125+
const sycl::device device = Plt.get_devices()[0];
126+
// Initialize root device
127+
rootDevice = sycl::detail::getSyclObjImpl(device)->getHandleRef();
128+
// Initialize sub-devices
129+
auto PltImpl = sycl::detail::getSyclObjImpl(Plt);
130+
auto subDev1 =
131+
std::make_shared<sycl::detail::device_impl>(piSubDev1, PltImpl);
132+
auto subDev2 =
133+
std::make_shared<sycl::detail::device_impl>(piSubDev2, PltImpl);
134+
sycl::context Ctx{
135+
{device, sycl::detail::createSyclObjFromImpl<sycl::device>(subDev1),
136+
sycl::detail::createSyclObjFromImpl<sycl::device>(subDev2)}};
137+
138+
// Create device binary description structures for getBuiltPIProgram API.
139+
auto devBin = Img.convertToNativeType();
140+
pi_device_binaries_struct devBinStruct{PI_DEVICE_BINARIES_VERSION, 1,
141+
&devBin};
142+
sycl::detail::ProgramManager::getInstance().addImages(&devBinStruct);
143+
144+
// Build program via getBuiltPIProgram API
145+
sycl::detail::ProgramManager::getInstance().getBuiltPIProgram(
146+
sycl::detail::OSUtil::getOSModuleHandle(&devBin),
147+
sycl::detail::getSyclObjImpl(Ctx), subDev1,
148+
sycl::detail::KernelInfo<TestKernel>::getName());
149+
// This call should re-use built binary from the cache. If piProgramBuild is
150+
// called again, the test will fail as second call of redefinedProgramBuild
151+
sycl::detail::ProgramManager::getInstance().getBuiltPIProgram(
152+
sycl::detail::OSUtil::getOSModuleHandle(&devBin),
153+
sycl::detail::getSyclObjImpl(Ctx), subDev2,
154+
sycl::detail::KernelInfo<TestKernel>::getName());
155+
}

0 commit comments

Comments
 (0)