Skip to content

Commit 1e7185a

Browse files
authored
[SYCL][CUDA] PI API Sampler Support for CUDA (#1993)
This PR introduces an implementation of the pi_sampler API for the PI CUDA backend. It also introduces piextKernelSetArgSampler function which has been defined in the CUDA, OpenCL and Level0 backends.
1 parent a03f315 commit 1e7185a

File tree

9 files changed

+189
-15
lines changed

9 files changed

+189
-15
lines changed

sycl/include/CL/sycl/detail/pi.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,5 +125,6 @@ _PI_API(piextUSMEnqueueMemAdvise)
125125
_PI_API(piextUSMGetMemAllocInfo)
126126

127127
_PI_API(piextKernelSetArgMemObj)
128+
_PI_API(piextKernelSetArgSampler)
128129

129130
#undef _PI_API

sycl/include/CL/sycl/detail/pi.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,10 +1332,22 @@ __SYCL_EXPORT pi_result piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
13321332
const pi_event *event_wait_list,
13331333
pi_event *event);
13341334

1335+
// Extension to allow backends to process a PI memory object before adding it
1336+
// as an argument for a kernel.
1337+
// Note: This is needed by the CUDA backend to extract the device pointer to
1338+
// the memory as the kernels uses it rather than the PI object itself.
13351339
__SYCL_EXPORT pi_result piextKernelSetArgMemObj(pi_kernel kernel,
13361340
pi_uint32 arg_index,
13371341
const pi_mem *arg_value);
13381342

1343+
// Extension to allow backends to process a PI sampler object before adding it
1344+
// as an argument for a kernel.
1345+
// Note: This is needed by the CUDA backend to extract the properties of the
1346+
// sampler as the kernels uses it rather than the PI object itself.
1347+
__SYCL_EXPORT pi_result piextKernelSetArgSampler(pi_kernel kernel,
1348+
pi_uint32 arg_index,
1349+
const pi_sampler *arg_value);
1350+
13391351
///
13401352
// USM
13411353
///

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 136 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2210,6 +2210,22 @@ pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index,
22102210
return retErr;
22112211
}
22122212

2213+
pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index,
2214+
const pi_sampler *arg_value) {
2215+
2216+
assert(kernel != nullptr);
2217+
assert(arg_value != nullptr);
2218+
2219+
pi_result retErr = PI_SUCCESS;
2220+
try {
2221+
pi_uint32 samplerProps = (*arg_value)->props_;
2222+
kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps);
2223+
} catch (pi_result err) {
2224+
retErr = err;
2225+
}
2226+
return retErr;
2227+
}
2228+
22132229
pi_result cuda_piEnqueueKernelLaunch(
22142230
pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim,
22152231
const size_t *global_work_offset, const size_t *global_work_size,
@@ -2989,32 +3005,139 @@ pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle,
29893005
return {};
29903006
}
29913007

2992-
/// \TODO Not implemented in CUDA, need untie from OpenCL
3008+
/// Creates a PI sampler object
3009+
///
3010+
/// \param[in] context The context the sampler is created for.
3011+
/// \param[in] sampler_properties The properties for the sampler.
3012+
/// \param[out] result_sampler Set to the resulting sampler object.
3013+
///
3014+
/// \return PI_SUCCESS on success. PI_INVALID_VALUE if given an invalid property
3015+
/// or if there is multiple of properties from the same category.
29933016
pi_result cuda_piSamplerCreate(pi_context context,
2994-
const cl_sampler_properties *sampler_properties,
3017+
const pi_sampler_properties *sampler_properties,
29953018
pi_sampler *result_sampler) {
2996-
cl::sycl::detail::pi::die("cuda_piSamplerCreate not implemented");
2997-
return {};
3019+
std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)};
3020+
3021+
bool propSeen[3] = {false, false, false};
3022+
for (size_t i = 0; sampler_properties[i] != 0; i += 2) {
3023+
switch (sampler_properties[i]) {
3024+
case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS:
3025+
if (propSeen[0]) {
3026+
return PI_INVALID_VALUE;
3027+
}
3028+
propSeen[0] = true;
3029+
retImplSampl->props_ |= sampler_properties[i + 1];
3030+
break;
3031+
case PI_SAMPLER_PROPERTIES_FILTER_MODE:
3032+
if (propSeen[1]) {
3033+
return PI_INVALID_VALUE;
3034+
}
3035+
propSeen[1] = true;
3036+
retImplSampl->props_ |=
3037+
(sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;
3038+
break;
3039+
case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE:
3040+
if (propSeen[2]) {
3041+
return PI_INVALID_VALUE;
3042+
}
3043+
propSeen[2] = true;
3044+
retImplSampl->props_ |=
3045+
(sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2;
3046+
break;
3047+
default:
3048+
return PI_INVALID_VALUE;
3049+
}
3050+
}
3051+
3052+
if (!propSeen[0]) {
3053+
retImplSampl->props_ |= CL_TRUE;
3054+
}
3055+
// Default filter mode to CL_FILTER_NEAREST
3056+
if (!propSeen[2]) {
3057+
retImplSampl->props_ |= (CL_ADDRESS_CLAMP % CL_ADDRESS_NONE) << 2;
3058+
}
3059+
3060+
*result_sampler = retImplSampl.release();
3061+
return PI_SUCCESS;
29983062
}
29993063

3000-
/// \TODO Not implemented in CUDA, need untie from OpenCL
3064+
/// Gets information from a PI sampler object
3065+
///
3066+
/// \param[in] sampler The sampler to get the information from.
3067+
/// \param[in] param_name The name of the information to get.
3068+
/// \param[in] param_value_size The size of the param_value.
3069+
/// \param[out] param_value Set to information value.
3070+
/// \param[out] param_value_size_ret Set to the size of the information value.
3071+
///
3072+
/// \return PI_SUCCESS on success.
30013073
pi_result cuda_piSamplerGetInfo(pi_sampler sampler, cl_sampler_info param_name,
30023074
size_t param_value_size, void *param_value,
30033075
size_t *param_value_size_ret) {
3004-
cl::sycl::detail::pi::die("cuda_piSamplerGetInfo not implemented");
3076+
assert(sampler != nullptr);
3077+
3078+
switch (param_name) {
3079+
case PI_SAMPLER_INFO_REFERENCE_COUNT:
3080+
return getInfo(param_value_size, param_value, param_value_size_ret,
3081+
sampler->get_reference_count());
3082+
case PI_SAMPLER_INFO_CONTEXT:
3083+
return getInfo(param_value_size, param_value, param_value_size_ret,
3084+
sampler->context_);
3085+
case PI_SAMPLER_INFO_NORMALIZED_COORDS: {
3086+
pi_bool norm_coords_prop = static_cast<pi_bool>(sampler->props_ & 0x1);
3087+
return getInfo(param_value_size, param_value, param_value_size_ret,
3088+
norm_coords_prop);
3089+
}
3090+
case PI_SAMPLER_INFO_FILTER_MODE: {
3091+
pi_sampler_filter_mode filter_prop = static_cast<pi_sampler_filter_mode>(
3092+
((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST);
3093+
return getInfo(param_value_size, param_value, param_value_size_ret,
3094+
filter_prop);
3095+
}
3096+
case PI_SAMPLER_INFO_ADDRESSING_MODE: {
3097+
pi_sampler_addressing_mode addressing_prop =
3098+
static_cast<pi_sampler_addressing_mode>(
3099+
(sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE);
3100+
return getInfo(param_value_size, param_value, param_value_size_ret,
3101+
addressing_prop);
3102+
}
3103+
default:
3104+
PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3105+
}
30053106
return {};
30063107
}
30073108

3008-
/// \TODO Not implemented in CUDA, need untie from OpenCL
3109+
/// Retains a PI sampler object, incrementing its reference count.
3110+
///
3111+
/// \param[in] sampler The sampler to increment the reference count of.
3112+
///
3113+
/// \return PI_SUCCESS.
30093114
pi_result cuda_piSamplerRetain(pi_sampler sampler) {
3010-
cl::sycl::detail::pi::die("cuda_piSamplerRetain not implemented");
3011-
return {};
3115+
assert(sampler != nullptr);
3116+
sampler->increment_reference_count();
3117+
return PI_SUCCESS;
30123118
}
30133119

3014-
/// \TODO Not implemented in CUDA, need untie from OpenCL
3120+
/// Releases a PI sampler object, decrementing its reference count. If the
3121+
/// reference count reaches zero, the sampler object is destroyed.
3122+
///
3123+
/// \param[in] sampler The sampler to decrement the reference count of.
3124+
///
3125+
/// \return PI_SUCCESS.
30153126
pi_result cuda_piSamplerRelease(pi_sampler sampler) {
3016-
cl::sycl::detail::pi::die("cuda_piSamplerRelease not implemented");
3017-
return {};
3127+
assert(sampler != nullptr);
3128+
3129+
// double delete or someone is messing with the ref count.
3130+
// either way, cannot safely proceed.
3131+
cl::sycl::detail::pi::assertion(
3132+
sampler->get_reference_count() != 0,
3133+
"Reference count overflow detected in cuda_piSamplerRelease.");
3134+
3135+
// decrement ref count. If it is 0, delete the sampler.
3136+
if (sampler->decrement_reference_count() == 0) {
3137+
delete sampler;
3138+
}
3139+
3140+
return PI_SUCCESS;
30183141
}
30193142

30203143
/// General 3D memory copy operation.
@@ -3934,6 +4057,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
39344057
_PI_CL(piextUSMGetMemAllocInfo, cuda_piextUSMGetMemAllocInfo)
39354058

39364059
_PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj)
4060+
_PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler)
39374061

39384062
#undef _PI_CL
39394063

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,26 @@ struct _pi_kernel {
617617
void clear_local_size() { args_.clear_local_size(); }
618618
};
619619

620+
/// Implementation of samplers for CUDA
621+
///
622+
/// Sampler property layout:
623+
/// | 31 30 ... 6 5 | 4 3 2 | 1 | 0 |
624+
/// | N/A | addressing mode | fiter mode | normalize coords |
625+
struct _pi_sampler {
626+
std::atomic_uint32_t refCount_;
627+
pi_uint32 props_;
628+
pi_context context_;
629+
630+
_pi_sampler(pi_context context)
631+
: refCount_(1), props_(0), context_(context) {}
632+
633+
pi_uint32 increment_reference_count() noexcept { return ++refCount_; }
634+
635+
pi_uint32 decrement_reference_count() noexcept { return --refCount_; }
636+
637+
pi_uint32 get_reference_count() const noexcept { return refCount_; }
638+
};
639+
620640
// -------------------------------------------------------------
621641
// Helper types and functions
622642
//

sycl/plugins/level_zero/pi_level0.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2008,7 +2008,7 @@ pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize,
20082008
return PI_SUCCESS;
20092009
}
20102010

2011-
// Special version of piKernelSetArg to accept pi_mem and pi_sampler.
2011+
// Special version of piKernelSetArg to accept pi_mem.
20122012
pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex,
20132013
const pi_mem *ArgValue) {
20142014
// TODO: the better way would probably be to add a new PI API for
@@ -2025,6 +2025,13 @@ pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex,
20252025
return PI_SUCCESS;
20262026
}
20272027

2028+
// Special version of piKernelSetArg to accept pi_sampler.
2029+
pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex,
2030+
const pi_sampler *ArgValue) {
2031+
die("piextKernelSetArgSampler: not implemented");
2032+
return {};
2033+
}
2034+
20282035
pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName,
20292036
size_t ParamValueSize, void *ParamValue,
20302037
size_t *ParamValueSizeRet) {

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,13 @@ pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index,
449449
sizeof(arg_value), cast<const cl_mem *>(arg_value)));
450450
}
451451

452+
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index,
453+
const pi_sampler *arg_value) {
454+
return cast<pi_result>(
455+
clSetKernelArg(cast<cl_kernel>(kernel), cast<cl_uint>(arg_index),
456+
sizeof(cl_sampler), cast<const cl_sampler *>(arg_value)));
457+
}
458+
452459
pi_result piextGetDeviceFunctionPointer(pi_device device, pi_program program,
453460
const char *func_name,
454461
pi_uint64 *function_pointer_ret) {
@@ -1234,6 +1241,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
12341241
_PI_CL(piextUSMGetMemAllocInfo, piextUSMGetMemAllocInfo)
12351242

12361243
_PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj)
1244+
_PI_CL(piextKernelSetArgSampler, piextKernelSetArgSampler)
12371245

12381246
#undef _PI_CL
12391247

sycl/source/detail/scheduler/commands.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,8 +1671,8 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch(
16711671
sampler *SamplerPtr = (sampler *)Arg.MPtr;
16721672
RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr)
16731673
->getOrCreateSampler(MQueue->get_context());
1674-
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
1675-
sizeof(cl_sampler), &Sampler);
1674+
Plugin.call<PiApiKind::piextKernelSetArgSampler>(Kernel, Arg.MIndex,
1675+
&Sampler);
16761676
break;
16771677
}
16781678
case kernel_param_kind_t::kind_pointer: {

sycl/test/abi/pi_level0_symbol_check.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ piProgramGetInfo
7777
piextGetDeviceFunctionPointer
7878
piEnqueueMemUnmap
7979
piextKernelSetArgMemObj
80+
piextKernelSetArgSampler
8081
piQueueCreate
8182
piEventCreate
8283
piKernelGetInfo

sycl/test/abi/pi_opencl_symbol_check.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ piextEventCreateWithNativeHandle
2828
piextGetDeviceFunctionPointer
2929
piextProgramGetNativeHandle
3030
piextKernelSetArgMemObj
31+
piextKernelSetArgSampler
3132
piextKernelSetArgPointer
3233
piextMemCreateWithNativeHandle
3334
piextMemGetNativeHandle

0 commit comments

Comments
 (0)