Skip to content

Commit c461779

Browse files
[SYCL] Add device_global initialization and PI APIs (#7797)
This commit adds initialization of device_global without device_image_scope. This is done as follows: 1. Allocate USM memory for the device_global. 2. Fill the USM memory with zeros. Store this event as a dependency for related kernels. 3. When enqueuing a kernel using one or more device_globals, get the initialization events for each of them (unless they have finished) and add them as dependencies for the kernel launch. Additionally, to write the USM pointer to the device_global, PI has two new functions: piextEnqueueDeviceVariableWrite and piextEnqueueDeviceVariableRead. These allow for writing to and reading from device_globals in a given program. These are implemented for OpenCL, L0, and CUDA. For CUDA sycl-post-link will generate a mapping from the device_global's unique ID and the name in the binary. Design doc: [sycl/doc/design/DeviceGlobal.md](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/DeviceGlobal.md) --------- Signed-off-by: Larsen, Steffen <[email protected]>
1 parent a464e30 commit c461779

25 files changed

+1166
-103
lines changed

sycl/include/sycl/detail/pi.def

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,4 +147,9 @@ _PI_API(piextUSMEnqueueMemset2D)
147147
_PI_API(piextUSMEnqueueMemcpy2D)
148148

149149
_PI_API(piGetDeviceAndHostTimer)
150+
151+
// Device global variable
152+
_PI_API(piextEnqueueDeviceGlobalVariableWrite)
153+
_PI_API(piextEnqueueDeviceGlobalVariableRead)
154+
150155
#undef _PI_API

sycl/include/sycl/detail/pi.h

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,11 @@
7575
// PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT context info query
7676
// descriptors.
7777
// 12.22 Add piGetDeviceAndHostTimer to query device wall-clock timestamp
78+
// 12.23 Added new piextEnqueueDeviceGlobalVariableWrite and
79+
// piextEnqueueDeviceGlobalVariableRead functions.
7880

7981
#define _PI_H_VERSION_MAJOR 12
80-
#define _PI_H_VERSION_MINOR 22
82+
#define _PI_H_VERSION_MINOR 23
8183

8284
#define _PI_STRING_HELPER(a) #a
8385
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -435,7 +437,9 @@ typedef enum {
435437
PI_COMMAND_TYPE_SVM_MEMCPY = 0x120A,
436438
PI_COMMAND_TYPE_SVM_MEMFILL = 0x120B,
437439
PI_COMMAND_TYPE_SVM_MAP = 0x120C,
438-
PI_COMMAND_TYPE_SVM_UNMAP = 0x120D
440+
PI_COMMAND_TYPE_SVM_UNMAP = 0x120D,
441+
PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ = 0x418E,
442+
PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE = 0x418F
439443
} _pi_command_type;
440444

441445
typedef enum {
@@ -808,6 +812,7 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4;
808812
/// must appear after the kernel name.
809813
#define __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \
810814
"@reqd_work_group_size"
815+
#define __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping"
811816

812817
/// This struct is a record of the device binary information. If the Kind field
813818
/// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec
@@ -1876,6 +1881,51 @@ __SYCL_EXPORT pi_result piextUSMEnqueueMemcpy2D(
18761881
pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist,
18771882
pi_event *event);
18781883

1884+
///
1885+
/// Device global variable
1886+
///
1887+
1888+
/// API for writing data from host to a device global variable.
1889+
///
1890+
/// \param queue is the queue
1891+
/// \param program is the program containing the device global variable
1892+
/// \param blocking_write is true if the write should block
1893+
/// \param name is the unique identifier for the device global variable
1894+
/// \param count is the number of bytes to copy
1895+
/// \param offset is the byte offset into the device global variable to start
1896+
/// copying
1897+
/// \param src is a pointer to where the data must be copied from
1898+
/// \param num_events_in_wait_list is a number of events in the wait list
1899+
/// \param event_wait_list is the wait list
1900+
/// \param event is the resulting event
1901+
pi_result piextEnqueueDeviceGlobalVariableWrite(
1902+
pi_queue queue, pi_program program, const char *name,
1903+
pi_bool blocking_write, size_t count, size_t offset, const void *src,
1904+
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
1905+
pi_event *event);
1906+
1907+
/// API reading data from a device global variable to host.
1908+
///
1909+
/// \param queue is the queue
1910+
/// \param program is the program containing the device global variable
1911+
/// \param blocking_read is true if the read should block
1912+
/// \param name is the unique identifier for the device global variable
1913+
/// \param count is the number of bytes to copy
1914+
/// \param offset is the byte offset into the device global variable to start
1915+
/// copying
1916+
/// \param dst is a pointer to where the data must be copied to
1917+
/// \param num_events_in_wait_list is a number of events in the wait list
1918+
/// \param event_wait_list is the wait list
1919+
/// \param event is the resulting event
1920+
pi_result piextEnqueueDeviceGlobalVariableRead(
1921+
pi_queue queue, pi_program program, const char *name, pi_bool blocking_read,
1922+
size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list,
1923+
const pi_event *event_wait_list, pi_event *event);
1924+
1925+
///
1926+
/// Plugin
1927+
///
1928+
18791929
/// API to get Plugin internal data, opaque to SYCL RT. Some devices whose
18801930
/// device code is compiled by the host compiler (e.g. CPU emulators) may use it
18811931
/// to access some device code functionality implemented in/behind the plugin.

sycl/include/sycl/ext/oneapi/device_global/device_global.hpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,6 @@ class
135135
static_assert(is_property_list<property_list_t>::value,
136136
"Property list is invalid.");
137137

138-
// TODO: Remove when support has been added for device_global without the
139-
// device_image_scope property.
140-
static_assert(
141-
property_list_t::template has_property<device_image_scope_key>(),
142-
"device_global without the device_image_scope property is currently "
143-
"unavailable.");
144-
145138
device_global() = default;
146139

147140
device_global(const device_global &) = delete;

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 101 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -670,31 +670,26 @@ _pi_program::_pi_program(pi_context ctxt)
670670

671671
_pi_program::~_pi_program() { cuda_piContextRelease(context_); }
672672

673-
bool get_kernel_metadata(std::string metadataName, const char *tag,
674-
std::string &kernelName) {
675-
const size_t tagLength = strlen(tag);
676-
const size_t metadataNameLength = metadataName.length();
677-
if (metadataNameLength >= tagLength &&
678-
metadataName.compare(metadataNameLength - tagLength, tagLength, tag) ==
679-
0) {
680-
kernelName = metadataName.substr(0, metadataNameLength - tagLength);
681-
return true;
682-
}
683-
return false;
673+
std::pair<std::string, std::string>
674+
splitMetadataName(const std::string &metadataName) {
675+
size_t splitPos = metadataName.rfind('@');
676+
if (splitPos == std::string::npos)
677+
return std::make_pair(metadataName, std::string{});
678+
return std::make_pair(metadataName.substr(0, splitPos),
679+
metadataName.substr(splitPos, metadataName.length()));
684680
}
685681

686682
pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata,
687683
size_t length) {
688684
for (size_t i = 0; i < length; ++i) {
689685
const pi_device_binary_property metadataElement = metadata[i];
690686
std::string metadataElementName{metadataElement->Name};
691-
std::string kernelName;
692687

693-
// If metadata is reqd_work_group_size record it for the corresponding
694-
// kernel name.
695-
if (get_kernel_metadata(metadataElementName,
696-
__SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE,
697-
kernelName)) {
688+
auto [prefix, tag] = splitMetadataName(metadataElementName);
689+
690+
if (tag == __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
691+
// If metadata is reqd_work_group_size, record it for the corresponding
692+
// kernel name.
698693
size_t MDElemsSize = metadataElement->ValSize - sizeof(std::uint64_t);
699694

700695
// Expect between 1 and 3 32-bit integer values.
@@ -709,9 +704,16 @@ pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata,
709704
// Read values and pad with 1's for values not present.
710705
std::uint32_t reqdWorkGroupElements[] = {1, 1, 1};
711706
std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize);
712-
kernelReqdWorkGroupSizeMD_[kernelName] =
707+
kernelReqdWorkGroupSizeMD_[prefix] =
713708
std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1],
714709
reqdWorkGroupElements[2]);
710+
} else if (tag == __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING) {
711+
const char *metadataValPtr =
712+
reinterpret_cast<const char *>(metadataElement->ValAddr) +
713+
sizeof(std::uint64_t);
714+
const char *metadataValPtrEnd =
715+
metadataValPtr + metadataElement->ValSize - sizeof(std::uint64_t);
716+
globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd};
715717
}
716718
}
717719
return PI_SUCCESS;
@@ -5505,6 +5507,82 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
55055507
return result;
55065508
}
55075509

5510+
pi_result cuda_piextEnqueueDeviceGlobalVariableWrite(
5511+
pi_queue queue, pi_program program, const char *name,
5512+
pi_bool blocking_write, size_t count, size_t offset, const void *src,
5513+
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
5514+
pi_event *event) {
5515+
assert(queue != nullptr);
5516+
assert(program != nullptr);
5517+
5518+
if (name == nullptr || src == nullptr)
5519+
return PI_ERROR_INVALID_VALUE;
5520+
5521+
// Since CUDA requires a the global variable to be referenced by name, we use
5522+
// metadata to find the correct name to access it by.
5523+
auto device_global_name_it = program->globalIDMD_.find(name);
5524+
if (device_global_name_it == program->globalIDMD_.end())
5525+
return PI_ERROR_INVALID_VALUE;
5526+
std::string device_global_name = device_global_name_it->second;
5527+
5528+
pi_result result = PI_SUCCESS;
5529+
try {
5530+
CUdeviceptr device_global = 0;
5531+
size_t device_global_size = 0;
5532+
result = PI_CHECK_ERROR(
5533+
cuModuleGetGlobal(&device_global, &device_global_size, program->get(),
5534+
device_global_name.c_str()));
5535+
5536+
if (offset + count > device_global_size)
5537+
return PI_ERROR_INVALID_VALUE;
5538+
5539+
return cuda_piextUSMEnqueueMemcpy(
5540+
queue, blocking_write, reinterpret_cast<void *>(device_global + offset),
5541+
src, count, num_events_in_wait_list, event_wait_list, event);
5542+
} catch (pi_result error) {
5543+
result = error;
5544+
}
5545+
return result;
5546+
}
5547+
5548+
pi_result cuda_piextEnqueueDeviceGlobalVariableRead(
5549+
pi_queue queue, pi_program program, const char *name, pi_bool blocking_read,
5550+
size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list,
5551+
const pi_event *event_wait_list, pi_event *event) {
5552+
assert(queue != nullptr);
5553+
assert(program != nullptr);
5554+
5555+
if (name == nullptr || dst == nullptr)
5556+
return PI_ERROR_INVALID_VALUE;
5557+
5558+
// Since CUDA requires a the global variable to be referenced by name, we use
5559+
// metadata to find the correct name to access it by.
5560+
auto device_global_name_it = program->globalIDMD_.find(name);
5561+
if (device_global_name_it == program->globalIDMD_.end())
5562+
return PI_ERROR_INVALID_VALUE;
5563+
std::string device_global_name = device_global_name_it->second;
5564+
5565+
pi_result result = PI_SUCCESS;
5566+
try {
5567+
CUdeviceptr device_global = 0;
5568+
size_t device_global_size = 0;
5569+
result = PI_CHECK_ERROR(
5570+
cuModuleGetGlobal(&device_global, &device_global_size, program->get(),
5571+
device_global_name.c_str()));
5572+
5573+
if (offset + count > device_global_size)
5574+
return PI_ERROR_INVALID_VALUE;
5575+
5576+
return cuda_piextUSMEnqueueMemcpy(
5577+
queue, blocking_read, dst,
5578+
reinterpret_cast<const void *>(device_global + offset), count,
5579+
num_events_in_wait_list, event_wait_list, event);
5580+
} catch (pi_result error) {
5581+
result = error;
5582+
}
5583+
return result;
5584+
}
5585+
55085586
// This API is called by Sycl RT to notify the end of the plugin lifetime.
55095587
// TODO: add a global variable lifetime management code here (see
55105588
// pi_level_zero.cpp for reference) Currently this is just a NOOP.
@@ -5685,6 +5763,11 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
56855763
_PI_CL(piextUSMEnqueueMemset2D, cuda_piextUSMEnqueueMemset2D)
56865764
_PI_CL(piextUSMEnqueueMemcpy2D, cuda_piextUSMEnqueueMemcpy2D)
56875765
_PI_CL(piextUSMGetMemAllocInfo, cuda_piextUSMGetMemAllocInfo)
5766+
// Device global variable
5767+
_PI_CL(piextEnqueueDeviceGlobalVariableWrite,
5768+
cuda_piextEnqueueDeviceGlobalVariableWrite)
5769+
_PI_CL(piextEnqueueDeviceGlobalVariableRead,
5770+
cuda_piextEnqueueDeviceGlobalVariableRead)
56885771

56895772
_PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj)
56905773
_PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler)

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,7 @@ struct _pi_program {
773773
// Metadata
774774
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
775775
kernelReqdWorkGroupSizeMD_;
776+
std::unordered_map<std::string, std::string> globalIDMD_;
776777

777778
constexpr static size_t MAX_LOG_SIZE = 8192u;
778779

sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2029,6 +2029,20 @@ pi_result piextUSMEnqueuePrefetch(pi_queue, const void *, size_t,
20292029
DIE_NO_IMPLEMENTATION;
20302030
}
20312031

2032+
pi_result piextEnqueueDeviceGlobalVariableWrite(pi_queue, pi_program,
2033+
const char *, pi_bool, size_t,
2034+
size_t, const void *, pi_uint32,
2035+
const pi_event *, pi_event *) {
2036+
DIE_NO_IMPLEMENTATION;
2037+
}
2038+
2039+
pi_result piextEnqueueDeviceGlobalVariableRead(pi_queue, pi_program,
2040+
const char *, pi_bool, size_t,
2041+
size_t, void *, pi_uint32,
2042+
const pi_event *, pi_event *) {
2043+
DIE_NO_IMPLEMENTATION;
2044+
}
2045+
20322046
pi_result piextPluginGetOpaqueData(void *, void **OpaqueDataReturn) {
20332047
*OpaqueDataReturn = reinterpret_cast<void *>(PiESimdDeviceAccess);
20342048
return PI_SUCCESS;

sycl/plugins/hip/pi_hip.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5228,6 +5228,47 @@ pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
52285228
return result;
52295229
}
52305230

5231+
pi_result hip_piextEnqueueDeviceGlobalVariableWrite(
5232+
pi_queue queue, pi_program program, const char *name,
5233+
pi_bool blocking_write, size_t count, size_t offset, const void *src,
5234+
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
5235+
pi_event *event) {
5236+
(void)queue;
5237+
(void)program;
5238+
(void)name;
5239+
(void)blocking_write;
5240+
(void)count;
5241+
(void)offset;
5242+
(void)src;
5243+
(void)num_events_in_wait_list;
5244+
(void)event_wait_list;
5245+
(void)event;
5246+
5247+
sycl::detail::pi::die(
5248+
"hip_piextEnqueueDeviceGlobalVariableWrite not implemented");
5249+
return {};
5250+
}
5251+
5252+
pi_result hip_piextEnqueueDeviceGlobalVariableRead(
5253+
pi_queue queue, pi_program program, const char *name, pi_bool blocking_read,
5254+
size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list,
5255+
const pi_event *event_wait_list, pi_event *event) {
5256+
(void)queue;
5257+
(void)program;
5258+
(void)name;
5259+
(void)blocking_read;
5260+
(void)count;
5261+
(void)offset;
5262+
(void)dst;
5263+
(void)num_events_in_wait_list;
5264+
(void)event_wait_list;
5265+
(void)event;
5266+
5267+
sycl::detail::pi::die(
5268+
"hip_piextEnqueueDeviceGlobalVariableRead not implemented");
5269+
return {};
5270+
}
5271+
52315272
// This API is called by Sycl RT to notify the end of the plugin lifetime.
52325273
// TODO: add a global variable lifetime management code here (see
52335274
// pi_level_zero.cpp for reference) Currently this is just a NOOP.
@@ -5401,6 +5442,11 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
54015442
_PI_CL(piextUSMEnqueueFill2D, hip_piextUSMEnqueueFill2D)
54025443
_PI_CL(piextUSMEnqueueMemset2D, hip_piextUSMEnqueueMemset2D)
54035444
_PI_CL(piextUSMGetMemAllocInfo, hip_piextUSMGetMemAllocInfo)
5445+
// Device global variable
5446+
_PI_CL(piextEnqueueDeviceGlobalVariableWrite,
5447+
hip_piextEnqueueDeviceGlobalVariableWrite)
5448+
_PI_CL(piextEnqueueDeviceGlobalVariableRead,
5449+
hip_piextEnqueueDeviceGlobalVariableRead)
54045450

54055451
_PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj)
54065452
_PI_CL(piextKernelSetArgSampler, hip_piextKernelSetArgSampler)

0 commit comments

Comments
 (0)