Skip to content

[SYCL] Do not build device code for sub-devices #5240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Feb 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
cd9818b
[SYCL][NFC] Call getCacheItemPath only if cache is enabled
bader Dec 28, 2021
04e3869
[SYCL][NFC] Don't include sycl.hpp from headers
bader Dec 28, 2021
ba29bbe
[SYCL][NFC] Factor out empty kernel creation boilerplate
bader Dec 28, 2021
f5b380b
[SYCL] Do not build device code for sub-devices.
bader Dec 23, 2021
5a3587e
Apply clang-format
bader Dec 28, 2021
28b7f80
Fix issues caught by pre-commit CI.
bader Dec 29, 2021
61e09bd
[NFC] Fix a few typos in the comments
bader Dec 29, 2021
d5b93f0
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Jan 20, 2022
a1e483a
Improved build results caching for GPU devices.
bader Jan 21, 2022
7ac48ae
Improve GPU caching.
bader Jan 24, 2022
d0f2861
Revert "Improve GPU caching."
bader Feb 8, 2022
231a1a3
Revert "Improved build results caching for GPU devices."
bader Feb 8, 2022
8f2d9c4
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Feb 8, 2022
d44e27f
Fix formatting.
bader Feb 8, 2022
6e310b0
Add device query for checking if device architecture is homogeneous
bader Feb 14, 2022
ce299cd
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Feb 14, 2022
e6ca4f9
Address code review feedback
bader Feb 15, 2022
bf57926
Added a FIXME comment.
bader Feb 17, 2022
d062d77
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Feb 17, 2022
0e650ea
Update sycl/source/detail/program_manager/program_manager.cpp
bader Feb 17, 2022
d1cc7aa
Move comment to Level Zero plug-in.
bader Feb 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,8 @@ typedef enum {
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
PI_DEVICE_INFO_IMAGE_SRGB = 0x10027,
// Return true if sub-device should do its own program build
PI_DEVICE_INFO_BUILD_ON_SUBDEVICE = 0x10028,
PI_DEVICE_INFO_ATOMIC_64 = 0x10110,
PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111,
PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,
Expand Down
4 changes: 4 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1490,6 +1490,10 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
}
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
}
case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
Expand Down
2 changes: 2 additions & 0 deletions sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
return ReturnValue("");
case PI_DEVICE_INFO_VERSION:
return ReturnValue(Device->VersionStr.c_str());
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: // emulator doesn't support partition
return ReturnValue(pi_bool{true});
case PI_DEVICE_INFO_COMPILER_AVAILABLE:
return ReturnValue(pi_bool{false});
case PI_DEVICE_INFO_LINKER_AVAILABLE:
Expand Down
4 changes: 4 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1409,6 +1409,10 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
}
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
}
case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
Expand Down
15 changes: 12 additions & 3 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2028,7 +2028,7 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
return PI_INVALID_VALUE;
}

// Get the cahched PI device created for the L0 device handle.
// Get the cached PI device created for the L0 device handle.
// Return NULL if no such PI device found.
pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {

Expand Down Expand Up @@ -2188,6 +2188,11 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() {
// Create PI sub-sub-devices with the sub-device for all the ordinals.
// Each {ordinal, index} points to a specific CCS which constructs
// a sub-sub-device at this point.
// FIXME: Level Zero creates multiple PiDevices for a single physical
// device when sub-device is partitioned into sub-sub-devices.
// Sub-sub-device is technically a command queue and we should not build
// program for each command queue. PiDevice is probably not the right
// abstraction for a Level Zero command queue.
for (uint32_t J = 0; J < Ordinals.size(); ++J) {
for (uint32_t K = 0; K < QueueGroupProperties[Ordinals[J]].numQueues;
++K) {
Expand Down Expand Up @@ -2276,8 +2281,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
}
}
case PI_DEVICE_INFO_PARENT_DEVICE:
// TODO: all Level Zero devices are parent ?
return ReturnValue(pi_device{0});
return ReturnValue(Device->RootDevice);
case PI_DEVICE_INFO_PLATFORM:
return ReturnValue(Device->Platform);
case PI_DEVICE_INFO_VENDOR_ID:
Expand Down Expand Up @@ -2337,6 +2341,11 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
}
case PI_DEVICE_INFO_NAME:
return ReturnValue(Device->ZeDeviceProperties->name);
// zeModuleCreate allows using root device module for sub-devices:
// > The application must only use the module for the device, or its
// > sub-devices, which was provided during creation.
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE:
return ReturnValue(PI_FALSE);
case PI_DEVICE_INFO_COMPILER_AVAILABLE:
return ReturnValue(pi_bool{1});
case PI_DEVICE_INFO_LINKER_AVAILABLE:
Expand Down
10 changes: 10 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,17 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
std::memcpy(paramValue, &result, sizeof(cl_bool));
return PI_SUCCESS;
}
case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
cl_int res = clGetDeviceInfo(cast<cl_device_id>(device), CL_DEVICE_TYPE,
sizeof(cl_device_type), &devType, nullptr);

// FIXME: here we assume that program built for a root GPU device can be
// used on its sub-devices without re-building
cl_bool result = (res == CL_SUCCESS) && (devType == CL_DEVICE_TYPE_GPU);
std::memcpy(paramValue, &result, sizeof(cl_bool));
return PI_SUCCESS;
}
case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D:
// Returns the maximum sizes of a work group for each dimension one
// could use to submit a kernel. There is no such query defined in OpenCL
Expand Down
4 changes: 1 addition & 3 deletions sycl/source/detail/device_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,11 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle,
Plugin.call<PiApiKind::piDeviceGetInfo>(
MDevice, PI_DEVICE_INFO_TYPE, sizeof(RT::PiDeviceType), &MType, nullptr);

RT::PiDevice parent = nullptr;
// TODO catch an exception and put it to list of asynchronous exceptions
Plugin.call<PiApiKind::piDeviceGetInfo>(MDevice, PI_DEVICE_INFO_PARENT_DEVICE,
sizeof(RT::PiDevice), &parent,
sizeof(RT::PiDevice), &MRootDevice,
nullptr);

MIsRootDevice = (nullptr == parent);
if (!InteroperabilityConstructor) {
// TODO catch an exception and put it to list of asynchronous exceptions
// Interoperability Constructor already calls DeviceRetain in
Expand Down
4 changes: 3 additions & 1 deletion sycl/source/detail/device_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,14 +226,16 @@ class device_impl {

bool isAssertFailSupported() const;

bool isRootDevice() const { return MRootDevice == nullptr; }

std::string getDeviceName() const;

private:
explicit device_impl(pi_native_handle InteropDevice, RT::PiDevice Device,
PlatformImplPtr Platform, const plugin &Plugin);
RT::PiDevice MDevice = 0;
RT::PiDeviceType MType;
bool MIsRootDevice = false;
RT::PiDevice MRootDevice = nullptr;
bool MIsHostDevice;
PlatformImplPtr MPlatform;
bool MIsAssertFailSupported = false;
Expand Down
10 changes: 8 additions & 2 deletions sycl/source/detail/persistent_device_code_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,13 @@ void PersistentDeviceCodeCache::putItemToDisc(
const SerializedObj &SpecConsts, const std::string &BuildOptionsString,
const RT::PiProgram &NativePrg) {

if (!isImageCached(Img))
return;

std::string DirName =
getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString);

if (!isImageCached(Img) || DirName.empty())
if (DirName.empty())
return;

auto Plugin = detail::getSyclObjImpl(Device)->getPlugin();
Expand Down Expand Up @@ -137,10 +140,13 @@ std::vector<std::vector<char>> PersistentDeviceCodeCache::getItemFromDisc(
const device &Device, const RTDeviceBinaryImage &Img,
const SerializedObj &SpecConsts, const std::string &BuildOptionsString) {

if (!isImageCached(Img))
return {};

std::string Path =
getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString);

if (!isImageCached(Img) || Path.empty() || !OSUtil::isPathPresent(Path))
if (Path.empty() || !OSUtil::isPathPresent(Path))
return {};

int i = 0;
Expand Down
25 changes: 22 additions & 3 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,10 +481,29 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(
if (Prg)
Prg->stableSerializeSpecConstRegistry(SpecConsts);

auto BuildF = [this, &M, &KSId, &ContextImpl, &DeviceImpl, Prg, &CompileOpts,
// Check if we can optimize program builds for sub-devices by using a program
// built for the root device
DeviceImplPtr RootDevImpl = DeviceImpl;
while (!RootDevImpl->isRootDevice()) {
auto ParentDev = detail::getSyclObjImpl(
RootDevImpl->get_info<info::device::parent_device>());
// Sharing is allowed within a single context only
if (!ContextImpl->hasDevice(ParentDev))
break;
RootDevImpl = ParentDev;
}

pi_bool MustBuildOnSubdevice = PI_TRUE;
ContextImpl->getPlugin().call<PiApiKind::piDeviceGetInfo>(
RootDevImpl->getHandleRef(), PI_DEVICE_INFO_BUILD_ON_SUBDEVICE,
sizeof(pi_bool), &MustBuildOnSubdevice, nullptr);

DeviceImplPtr Dev =
(MustBuildOnSubdevice == PI_TRUE) ? DeviceImpl : RootDevImpl;
auto BuildF = [this, &M, &KSId, &ContextImpl, &Dev, Prg, &CompileOpts,
&LinkOpts, &JITCompilationIsRequired, SpecConsts] {
auto Context = createSyclObjFromImpl<context>(ContextImpl);
auto Device = createSyclObjFromImpl<device>(DeviceImpl);
auto Device = createSyclObjFromImpl<device>(Dev);

const RTDeviceBinaryImage &Img =
getDeviceImage(M, KSId, Context, Device, JITCompilationIsRequired);
Expand Down Expand Up @@ -536,7 +555,7 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(
return BuiltProgram.release();
};

const RT::PiDevice PiDevice = DeviceImpl->getHandleRef();
const RT::PiDevice PiDevice = Dev->getHandleRef();

auto BuildResult = getOrBuild<PiProgramT, compile_program_error>(
Cache,
Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/program_manager/program_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ class ProgramManager {
SerializedObj SpecConsts);
/// Builds or retrieves from cache a program defining the kernel with given
/// name.
/// \param M idenfies the OS module the kernel comes from (multiple OS modules
/// may have kernels with the same name)
/// \param M identifies the OS module the kernel comes from (multiple OS
/// modules may have kernels with the same name)
/// \param Context the context to build the program with
/// \param Device the device for which the program is built
/// \param KernelName the kernel's name
Expand Down Expand Up @@ -153,7 +153,7 @@ class ProgramManager {
/// \param NativePrg the native program, target for spec constant setting; if
/// not null then overrides the native program in Prg
/// \param Img A source of the information about which constants need
/// setting and symboling->integer spec constnant ID mapping. If not
/// setting and symboling->integer spec constant ID mapping. If not
/// null, overrides native program->binary image binding maintained by
/// the program manager.
void flushSpecConstants(const program_impl &Prg,
Expand Down
1 change: 1 addition & 0 deletions sycl/unittests/program_manager/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ add_sycl_unittest(ProgramManagerTests OBJECT
BuildLog.cpp
EliminatedArgMask.cpp
itt_annotations.cpp
SubDevices.cpp
)

155 changes: 155 additions & 0 deletions sycl/unittests/program_manager/SubDevices.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl/program.hpp>
#include <detail/kernel_bundle_impl.hpp>

#include <helpers/CommonRedefinitions.hpp>
#include <helpers/PiImage.hpp>
#include <helpers/PiMock.hpp>

#include <gtest/gtest.h>

#include <helpers/TestKernel.hpp>

static pi_device rootDevice;
static pi_device piSubDev1 = (pi_device)0x1;
static pi_device piSubDev2 = (pi_device)0x2;

namespace {
pi_result redefinedDeviceGetInfo(pi_device device, pi_device_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {
if (param_name == PI_DEVICE_INFO_PARTITION_PROPERTIES) {
if (!param_value) {
*param_value_size_ret = 2 * sizeof(pi_device_partition_property);
} else {
((pi_device_partition_property *)param_value)[0] =
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
((pi_device_partition_property *)param_value)[1] =
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
}
}
if (param_name == PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN) {
if (!param_value) {
*param_value_size_ret = sizeof(pi_device_affinity_domain);
} else {
((pi_device_affinity_domain *)param_value)[0] =
PI_DEVICE_AFFINITY_DOMAIN_NUMA |
PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE;
}
}
if (param_name == PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES) {
((pi_uint32 *)param_value)[0] = 2;
}
if (param_name == PI_DEVICE_INFO_PARENT_DEVICE) {
if (device == piSubDev1 || device == piSubDev2)
((pi_device *)param_value)[0] = rootDevice;
else
((pi_device *)param_value)[0] = nullptr;
}
return PI_SUCCESS;
}

pi_result redefinedDevicePartition(
pi_device Device, const pi_device_partition_property *Properties,
pi_uint32 NumDevices, pi_device *OutDevices, pi_uint32 *OutNumDevices) {
if (OutNumDevices)
*OutNumDevices = 2;
if (OutDevices) {
OutDevices[0] = {};
OutDevices[1] = {};
}
return PI_SUCCESS;
}

pi_result redefinedDeviceRetain(pi_device c) { return PI_SUCCESS; }

pi_result redefinedDeviceRelease(pi_device c) { return PI_SUCCESS; }

pi_result redefinedProgramBuild(
pi_program prog, pi_uint32, const pi_device *, const char *,
void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
static int m = 0;
m++;
// if called more than once return an error
if (m > 1)
return PI_ERROR_UNKNOWN;

return PI_SUCCESS;
}

pi_result redefinedContextCreate(const pi_context_properties *Properties,
pi_uint32 NumDevices, const pi_device *Devices,
void (*PFnNotify)(const char *ErrInfo,
const void *PrivateInfo,
size_t CB, void *UserData),
void *UserData, pi_context *RetContext) {
return PI_SUCCESS;
}
} // anonymous namespace

// Check that program is built once for all sub-devices
// FIXME: mock 3 devices (one root device + two sub-devices) within a single
// context.
TEST(SubDevices, DISABLED_BuildProgramForSubdevices) {
sycl::platform Plt{sycl::default_selector()};
// Host devices do not support sub-devices
if (Plt.is_host() || Plt.get_backend() == sycl::backend::ext_oneapi_cuda ||
Plt.get_backend() == sycl::backend::ext_oneapi_hip) {
std::cerr << "Test is not supported on "
<< Plt.get_info<sycl::info::platform::name>() << ", skipping\n";
GTEST_SKIP(); // test is not supported on selected platform.
}

// Setup Mock APIs
sycl::unittest::PiMock Mock{Plt};
setupDefaultMockAPIs(Mock);
Mock.redefine<sycl::detail::PiApiKind::piDeviceGetInfo>(
redefinedDeviceGetInfo);
Mock.redefine<sycl::detail::PiApiKind::piDevicePartition>(
redefinedDevicePartition);
Mock.redefine<sycl::detail::PiApiKind::piDeviceRetain>(redefinedDeviceRetain);
Mock.redefine<sycl::detail::PiApiKind::piDeviceRelease>(
redefinedDeviceRelease);
Mock.redefine<sycl::detail::PiApiKind::piProgramBuild>(redefinedProgramBuild);
Mock.redefine<sycl::detail::PiApiKind::piContextCreate>(
redefinedContextCreate);

// Create 2 sub-devices and use first platform device as a root device
const sycl::device device = Plt.get_devices()[0];
// Initialize root device
rootDevice = sycl::detail::getSyclObjImpl(device)->getHandleRef();
// Initialize sub-devices
auto PltImpl = sycl::detail::getSyclObjImpl(Plt);
auto subDev1 =
std::make_shared<sycl::detail::device_impl>(piSubDev1, PltImpl);
auto subDev2 =
std::make_shared<sycl::detail::device_impl>(piSubDev2, PltImpl);
sycl::context Ctx{
{device, sycl::detail::createSyclObjFromImpl<sycl::device>(subDev1),
sycl::detail::createSyclObjFromImpl<sycl::device>(subDev2)}};

// Create device binary description structures for getBuiltPIProgram API.
auto devBin = Img.convertToNativeType();
pi_device_binaries_struct devBinStruct{PI_DEVICE_BINARIES_VERSION, 1,
&devBin};
sycl::detail::ProgramManager::getInstance().addImages(&devBinStruct);

// Build program via getBuiltPIProgram API
sycl::detail::ProgramManager::getInstance().getBuiltPIProgram(
sycl::detail::OSUtil::getOSModuleHandle(&devBin),
sycl::detail::getSyclObjImpl(Ctx), subDev1,
sycl::detail::KernelInfo<TestKernel>::getName());
// This call should re-use built binary from the cache. If piProgramBuild is
// called again, the test will fail as second call of redefinedProgramBuild
sycl::detail::ProgramManager::getInstance().getBuiltPIProgram(
sycl::detail::OSUtil::getOSModuleHandle(&devBin),
sycl::detail::getSyclObjImpl(Ctx), subDev2,
sycl::detail::KernelInfo<TestKernel>::getName());
}