Skip to content

Commit a0de2d7

Browse files
Petr Vesely and callumfare
authored and committed
[SYCL][CUDA][PI][UR] Fix PR review comments
1 parent 9b3448a commit a0de2d7

File tree

13 files changed

+85
-94
lines changed

13 files changed

+85
-94
lines changed

sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
2121

2222
/// UR context mapping to a CUDA context object.
2323
///
24-
/// There is no direct mapping between a CUDA context and a UR context,
25-
/// main differences described below:
24+
/// There is no direct mapping between a CUDA context and a UR context.
25+
/// The main differences are described below:
2626
///
2727
/// <b> CUDA context vs UR context </b>
2828
///
@@ -32,21 +32,21 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
3232
/// with a given device and control access to said device from the user side.
3333
/// UR API context are objects that are passed to functions, and not bound
3434
/// to threads.
35-
/// The _ur_context object doesn't implement this behavior, only holds the
36-
/// CUDA context data. The RAII object \ref ScopedContext implements the active
37-
/// context behavior.
35+
/// The ur_context_handle_t_ object doesn't implement this behavior. It only
36+
/// holds the CUDA context data. The RAII object \ref ScopedContext implements
37+
/// the active context behavior.
3838
///
3939
/// <b> Primary vs User-defined context </b>
4040
///
4141
/// CUDA has two different types of context, the Primary context,
4242
/// which is usable by all threads on a given process for a given device, and
4343
/// the aforementioned custom contexts.
44-
/// CUDA documentation, and performance analysis, indicates it is recommended
45-
/// to use Primary context whenever possible.
46-
/// Primary context is used as well by the CUDA Runtime API.
44+
/// The CUDA documentation, confirmed with performance analysis, suggest using
45+
/// the Primary context whenever possible.
46+
/// The Primary context is also used by the CUDA Runtime API.
4747
/// For UR applications to interop with CUDA Runtime API, they have to use
4848
/// the primary context - and make that active in the thread.
49-
/// The `_ur_context` object can be constructed with a `kind` parameter
49+
/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter
5050
/// that allows to construct a Primary or `user-defined` context, so that
5151
/// the UR object interface is always the same.
5252
///
@@ -56,6 +56,7 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
5656
/// the PI Context can store a number of callback functions that will be
5757
/// called upon destruction of the UR Context.
5858
/// See proposal for details.
59+
/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md
5960
///
6061
struct ur_context_handle_t_ {
6162

sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ struct ur_event_handle_t_ {
101101
uint32_t StreamToken);
102102

103103
// This constructor is private to force programmers to use the
104-
// makeWithNative for event introp
104+
// makeWithNative for event interop
105105
ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative);
106106

107107
ur_command_t CommandType; // The type of command associated with event.
@@ -117,33 +117,34 @@ struct ur_event_handle_t_ {
117117
bool IsRecorded; // Signifies wether a native CUDA event has been recorded
118118
// yet.
119119
bool IsStarted; // Signifies wether the operation associated with the
120-
// PI event has started or not
120+
// UR event has started or not
121121

122122
uint32_t StreamToken;
123123
uint32_t EventID; // Queue identifier of the event.
124124

125-
native_type EvEnd; // CUDA event handle. If this _pi_event represents a user
126-
// event, this will be nullptr.
125+
native_type EvEnd; // CUDA event handle. If this ur_event_handle_t represents
126+
// a user event, this will be nullptr.
127127

128128
native_type EvStart; // CUDA event handle associated with the start
129129

130130
native_type EvQueued; // CUDA event handle associated with the time
131131
// the command was enqueued
132132

133-
ur_queue_handle_t Queue; // pi_queue associated with the event. If this is a
134-
// user event, this will be nullptr.
133+
ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If
134+
// this is a user event, this will be nullptr.
135135

136136
CUstream Stream; // CUstream associated with the event. If this is a user
137137
// event, this will be uninitialized.
138138

139-
ur_context_handle_t Context; // pi_context associated with the event. If this
140-
// is a native event, this will be the same
141-
// context associated with the queue_ member.
139+
ur_context_handle_t Context; // ur_context_handle_t associated with the event.
140+
// If this is a native event, this will be the
141+
// same context associated with the queue member.
142142
};
143143

144-
// Iterates over the event wait list, returns correct ur_result_t error codes.
145-
// Invokes the callback for the latest event of each queue in the wait list.
146-
// The callback must take a single pi_event argument and return a ur_result_t.
144+
// Iterate over `event_wait_list` and apply the given callback `f` to the
145+
// latest event on each queue therein. The callback must take a single
146+
// ur_event_handle_t argument and return a ur_result_t. If the callback returns
147+
// an error, the iteration terminates and the error is returned.
147148
template <typename Func>
148149
ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList,
149150
std::size_t NumEventsInWaitList, Func &&F) {
@@ -169,14 +170,13 @@ ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList,
169170
Event0->getEventID() > Event1->getEventID());
170171
});
171172

172-
bool First = true;
173173
CUstream LastSeenStream = 0;
174-
for (ur_event_handle_t Event : Events) {
175-
if (!Event || (!First && Event->getStream() == LastSeenStream)) {
174+
for (size_t i = 0; i < Events.size(); i++) {
175+
auto Event = Events[i];
176+
if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) {
176177
continue;
177178
}
178179

179-
First = false;
180180
LastSeenStream = Event->getStream();
181181

182182
auto Result = F(Event);

sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
6666
void *pPropValue, size_t *pPropSizeRet) {
6767
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
6868

69-
// Here we want to query about a kernel's cuda blocks!
7069
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
7170

7271
switch (propName) {
@@ -356,6 +355,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
356355
ur_program_handle_t hProgram,
357356
const ur_kernel_native_properties_t *pProperties,
358357
ur_kernel_handle_t *phKernel) {
358+
std::ignore = hNativeKernel;
359+
std::ignore = hContext;
360+
std::ignore = hProgram;
361+
std::ignore = pProperties;
362+
std::ignore = phKernel;
359363
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
360364
}
361365

sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,14 @@
2323
/// invocation. This is not the case of CUFunction objects,
2424
/// which are simply passed together with the arguments on the invocation.
2525
/// The UR Kernel implementation for CUDA stores the list of arguments,
26-
/// argument sizes and offsets to emulate the interface of UR Kernel,
26+
/// argument sizes, and offsets to emulate the interface of UR Kernel,
2727
/// saving the arguments for the later dispatch.
2828
/// Note that in UR API, the Local memory is specified as a size per
2929
/// individual argument, but in CUDA only the total usage of shared
3030
/// memory is required since it is not passed as a parameter.
3131
/// A compiler pass converts the UR API local memory model into the
3232
/// CUDA shared model. This object simply calculates the total of
3333
/// shared memory, and the initial offsets of each parameter.
34-
///
3534
struct ur_kernel_handle_t_ {
3635
using native_type = CUfunction;
3736

@@ -68,7 +67,7 @@ struct ur_kernel_handle_t_ {
6867
Indices.emplace_back(&ImplicitOffsetArgs);
6968
}
7069

71-
/// Adds an argument to the kernel.
70+
/// Add an argument to the kernel.
7271
/// If the argument existed before, it is replaced.
7372
/// Otherwise, it is added.
7473
/// Gaps are filled with empty arguments.
@@ -104,8 +103,9 @@ struct ur_kernel_handle_t_ {
104103

105104
// align the argument
106105
size_t AlignedLocalOffset = LocalOffset;
107-
if (LocalOffset % Alignment != 0) {
108-
AlignedLocalOffset += Alignment - (LocalOffset % Alignment);
106+
size_t Pad = LocalOffset % Alignment;
107+
if (Pad != 0) {
108+
AlignedLocalOffset += Alignment - Pad;
109109
}
110110

111111
addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
@@ -171,7 +171,7 @@ struct ur_kernel_handle_t_ {
171171

172172
const char *getName() const noexcept { return Name.c_str(); }
173173

174-
/// Returns the number of arguments, excluding the implicit global offset.
174+
/// Get the number of kernel arguments, excluding the implicit global offset.
175175
/// Note this only returns the current known number of arguments, not the
176176
/// real one required by the kernel, since this cannot be queried from
177177
/// the CUDA Driver API

sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
/// Creates a UR Memory object using a CUDA memory allocation.
1616
/// Can trigger a manual copy depending on the mode.
17-
/// \TODO Implement USE_HOST_PTR using cuHostRegister
17+
/// \TODO Implement USE_HOST_PTR using cuHostRegister - See #9789
1818
///
1919
UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
2020
ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size,
@@ -109,7 +109,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
109109
/// Decreases the reference count of the Mem object.
110110
/// If this is zero, calls the relevant CUDA Free function
111111
/// \return UR_RESULT_SUCCESS unless deallocation error
112-
///
113112
UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
114113
UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
115114

@@ -435,7 +434,6 @@ urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType,
435434
/// Implements a buffer partition in the CUDA backend.
436435
/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented
437436
/// as an offset over an existing CUDA allocation.
438-
///
439437
UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
440438
ur_mem_handle_t hBuffer, ur_mem_flags_t flags,
441439
ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion,

sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
/// Keeps tracks of all mapped regions used for Map/Unmap calls.
1919
/// Only one region can be active at the same time per allocation.
2020
struct ur_mem_handle_t_ {
21-
// Context where the memory object is accessibles
21+
// Context where the memory object is accessible
2222
ur_context_handle_t Context;
2323

2424
/// Reference counting of the handler
@@ -31,7 +31,7 @@ struct ur_mem_handle_t_ {
3131
/// A UR Memory object represents either plain memory allocations ("Buffers"
3232
/// in OpenCL) or typed allocations ("Images" in OpenCL).
3333
/// In CUDA their API handlers are different. Whereas "Buffers" are allocated
34-
/// as pointer-like structs, "Images" are stored in Textures or Surfaces
34+
/// as pointer-like structs, "Images" are stored in Textures or Surfaces.
3535
/// This union allows implementation to use either from the same handler.
3636
union MemImpl {
3737
// Handler for plain, pointer-based CUDA allocations
@@ -80,7 +80,6 @@ struct ur_mem_handle_t_ {
8080
/// Returns a pointer to data visible on the host that contains
8181
/// the data on the device associated with this allocation.
8282
/// The offset is used to index into the CUDA allocation.
83-
///
8483
void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept {
8584
assert(MapPtr == nullptr);
8685
MapOffset = Offset;
@@ -152,7 +151,6 @@ struct ur_mem_handle_t_ {
152151
ur_mem_type_t ImageType, void *HostPtr)
153152
: Context{Context}, RefCount{1}, MemType{Type::Surface},
154153
MemFlags{MemFlags} {
155-
// Ignore unused parameter
156154
(void)HostPtr;
157155

158156
Mem.SurfaceMem.Array = Array;
@@ -162,16 +160,13 @@ struct ur_mem_handle_t_ {
162160
}
163161

164162
~ur_mem_handle_t_() {
165-
if (MemType == Type::Buffer) {
166-
if (isSubBuffer()) {
167-
urMemRelease(Mem.BufferMem.Parent);
168-
return;
169-
}
163+
if (isBuffer() && isSubBuffer()) {
164+
urMemRelease(Mem.BufferMem.Parent);
165+
return;
170166
}
171167
urContextRelease(Context);
172168
}
173169

174-
// TODO: Move as many shared funcs up as possible
175170
bool isBuffer() const noexcept { return MemType == Type::Buffer; }
176171

177172
bool isSubBuffer() const noexcept {

sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
5656
///
5757
/// However because multiple devices in a context is not currently supported,
5858
/// place each device in a separate platform.
59-
///
6059
UR_DLLEXPORT ur_result_t UR_APICALL
6160
urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms,
6261
uint32_t *pNumPlatforms) {
@@ -183,7 +182,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) {
183182
return UR_RESULT_SUCCESS;
184183
}
185184

186-
// Returns plugin specific backend option.
185+
// Get CUDA plugin specific backend option.
187186
// Current support is only for optimization options.
188187
// Return empty string for cuda.
189188
// TODO: Determine correct string to be passed.

sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
159159
/// CUDA driver API doesn't expose an operation for this.
160160
/// Note: This is currently only being used by the SYCL program class for the
161161
/// has_kernel method, so an alternative would be to move the has_kernel
162-
/// query to PI and use cuModuleGetFunction to check for a kernel.
162+
/// query to UR and use cuModuleGetFunction to check for a kernel.
163163
/// Note: Another alternative is to add kernel names as metadata, like with
164164
/// reqd_work_group_size.
165165
ur_result_t getKernelNames(ur_program_handle_t) {
@@ -169,7 +169,6 @@ ur_result_t getKernelNames(ur_program_handle_t) {
169169
/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
170170
/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
171171
/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
172-
///
173172
UR_APIEXPORT ur_result_t UR_APICALL
174173
urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
175174
size_t length, const ur_program_properties_t *pProperties,
@@ -186,7 +185,6 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
186185
/// CUDA will handle the PTX/CUBIN binaries internally through a call to
187186
/// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent
188187
/// in terms of CUDA adapter. \TODO Implement asynchronous compilation
189-
///
190188
UR_APIEXPORT ur_result_t UR_APICALL
191189
urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
192190
const char *pOptions) {
@@ -196,7 +194,6 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
196194
/// Loads the images from a UR program into a CUmodule that can be
197195
/// used later on to extract functions (kernels).
198196
/// See \ref ur_program_handle_t for implementation details.
199-
///
200197
UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
201198
ur_program_handle_t hProgram,
202199
const char *pOptions) {
@@ -218,7 +215,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
218215
/// Creates a new UR program object that is the outcome of linking all input
219216
/// programs.
220217
/// \TODO Implement linker options, requires mapping of OpenCL to CUDA
221-
///
222218
UR_APIEXPORT ur_result_t UR_APICALL
223219
urProgramLink(ur_context_handle_t hContext, uint32_t count,
224220
const ur_program_handle_t *phPrograms, const char *pOptions,
@@ -390,10 +386,10 @@ urProgramRelease(ur_program_handle_t hProgram) {
390386

391387
/// Gets the native CUDA handle of a UR program object
392388
///
393-
/// \param[in] program The PI program to get the native CUDA object of.
394-
/// \param[out] nativeHandle Set to the native handle of the PI program object.
389+
/// \param[in] program The UR program handle to get the native CUDA object of.
390+
/// \param[out] nativeHandle Set to the native handle of the UR program object.
395391
///
396-
/// \return TBD
392+
/// \return ur_result_t
397393
UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
398394
ur_program_handle_t program, ur_native_handle_t *nativeHandle) {
399395
UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE);

sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ CUstream ur_queue_handle_t_::getNextTransferStream() {
115115
/// Valid properties
116116
/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT
117117
/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING
118-
///
119118
UR_APIEXPORT ur_result_t UR_APICALL
120119
urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
121120
const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) {
@@ -294,7 +293,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
294293

295294
UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet);
296295

297-
switch (uint32_t{propName}) {
296+
switch (propName) {
298297
case UR_QUEUE_INFO_CONTEXT:
299298
return ReturnValue(hQueue->Context);
300299
case UR_QUEUE_INFO_DEVICE:
@@ -324,7 +323,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
324323
}
325324
}
326325
default:
327-
break;
326+
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
328327
}
329328

330329
return UR_RESULT_ERROR_INVALID_ENUMERATION;

0 commit comments

Comments (0)