Skip to content

[SYCL][CUDA] Use piclProgramCreateWithBinary with CUDA backend #1791

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 55 additions & 52 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,19 +389,20 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) {
}

_pi_program::_pi_program(pi_context ctxt)
: module_{nullptr}, source_{}, sourceLength_{0}
, refCount_{1}, context_{ctxt}
{
: module_{nullptr}, binary_{},
binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} {
cuda_piContextRetain(context_);
}

_pi_program::~_pi_program() {
cuda_piContextRelease(context_);
}

pi_result _pi_program::create_from_source(const char *source, size_t length) {
source_ = source;
sourceLength_ = length;
pi_result _pi_program::set_binary(const char *source, size_t length) {
assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
"Re-setting program binary data which has already been set");
binary_ = source;
binarySizeInBytes_ = length;
return PI_SUCCESS;
}

Expand All @@ -427,9 +428,9 @@ pi_result _pi_program::build_program(const char *build_options) {
options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
optionVals[3] = (void *)(long)MAX_LOG_SIZE;

auto result = PI_CHECK_ERROR(cuModuleLoadDataEx(
&module_, static_cast<const void *>(source_), numberOfOptions, options,
optionVals));
auto result = PI_CHECK_ERROR(
cuModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
numberOfOptions, options, optionVals));

const auto success = (result == PI_SUCCESS);

Expand All @@ -446,8 +447,8 @@ pi_result _pi_program::build_program(const char *build_options) {
/// has_kernel method, so an alternative would be to move the has_kernel
/// query to PI and use cuModuleGetFunction to check for a kernel.
std::string getKernelNames(pi_program program) {
std::string source(program->source_,
program->source_ + program->sourceLength_);
std::string source(program->binary_,
program->binary_ + program->binarySizeInBytes_);
std::regex entries_pattern(".entry\\s+([^\\([:s:]]*)");
std::string names("");
std::smatch match;
Expand Down Expand Up @@ -2172,41 +2173,15 @@ pi_result cuda_piMemRetain(pi_mem mem) {
return PI_SUCCESS;
}

/// Constructs a PI program from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// \TODO Implement more than one input image
/// \TODO SYCL RT should use cuda_piclprogramCreateWithBinary instead
/// Not used as CUDA backend only creates programs from binary.
/// See \ref cuda_piclProgramCreateWithBinary.
///
pi_result cuda_piclProgramCreateWithSource(pi_context context, pi_uint32 count,
const char **strings,
const size_t *lengths,
pi_program *program) {

assert(context != nullptr);
assert(strings != nullptr);
assert(program != nullptr);

pi_result retErr = PI_SUCCESS;

if (count == 0) {
retErr = PI_INVALID_PROGRAM;
return retErr;
}

assert(count == 1);

std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};

auto has_length = (lengths != nullptr);
size_t length = has_length ? lengths[0] : strlen(strings[0]) + 1;

retProgram->create_from_source(strings[0], length);

*program = retProgram.release();

return retErr;
cl::sycl::detail::pi::die("cuda_piclProgramCreateWithSource not implemented");
return {};
}

/// Loads the images from a PI program into a CUmodule that can be
Expand Down Expand Up @@ -2244,13 +2219,41 @@ pi_result cuda_piProgramCreate(pi_context context, const void *il,
return {};
}

/// \TODO Not implemented. See \ref cuda_piclProgramCreateWithSource
/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
pi_result cuda_piclProgramCreateWithBinary(
pi_context context, pi_uint32 num_devices, const pi_device *device_list,
const size_t *lengths, const unsigned char **binaries,
pi_int32 *binary_status, pi_program *errcode_ret) {
cl::sycl::detail::pi::die("cuda_piclProgramCreateWithBinary not implemented");
return {};
pi_int32 *binary_status, pi_program *program) {
assert(context != nullptr);
assert(binaries != nullptr);
assert(program != nullptr);
assert(device_list != nullptr);
assert(num_devices == 1 && "CUDA contexts are for a single device");
assert((context->get_device()->get() == device_list[0]->get()) &&
"Mismatch between devices context and passed context when creating "
"program from binary");

pi_result retError = PI_SUCCESS;

std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};

const bool has_length = (lengths != nullptr);
size_t length = has_length
? lengths[0]
: strlen(reinterpret_cast<const char *>(binaries[0])) + 1;

assert(length != 0);

retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);

*program = retProgram.release();

return retError;
}

pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
Expand All @@ -2272,13 +2275,13 @@ pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
&program->context_->deviceId_);
case PI_PROGRAM_INFO_SOURCE:
return getInfo(param_value_size, param_value, param_value_size_ret,
program->source_);
program->binary_);
case PI_PROGRAM_INFO_BINARY_SIZES:
return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
&program->sourceLength_);
&program->binarySizeInBytes_);
case PI_PROGRAM_INFO_BINARIES:
return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
&program->source_);
&program->binary_);
case PI_PROGRAM_INFO_KERNEL_NAMES: {
return getInfo(param_value_size, param_value, param_value_size_ret,
getKernelNames(program).c_str());
Expand Down Expand Up @@ -2320,15 +2323,15 @@ pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
for (size_t i = 0; i < num_input_programs; ++i) {
pi_program program = input_programs[i];
retError = PI_CHECK_ERROR(cuLinkAddData(
state, CU_JIT_INPUT_PTX, const_cast<char *>(program->source_),
program->sourceLength_, nullptr, 0, nullptr, nullptr));
state, CU_JIT_INPUT_PTX, const_cast<char *>(program->binary_),
program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr));
}
void *cubin = nullptr;
size_t cubinSize = 0;
retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize));

retError = retProgram->create_from_source(
static_cast<const char *>(cubin), cubinSize);
retError =
retProgram->set_binary(static_cast<const char *>(cubin), cubinSize);

if (retError != PI_SUCCESS) {
return retError;
Expand Down
6 changes: 3 additions & 3 deletions sycl/plugins/cuda/pi_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,8 +404,8 @@ class _pi_event {
struct _pi_program {
using native_type = CUmodule;
native_type module_;
const char *source_;
size_t sourceLength_;
const char *binary_;
size_t binarySizeInBytes_;
std::atomic_uint32_t refCount_;
_pi_context *context_;

Expand All @@ -418,7 +418,7 @@ struct _pi_program {
_pi_program(pi_context ctxt);
~_pi_program();

pi_result create_from_source(const char *source, size_t length);
pi_result set_binary(const char *binary, size_t binarySizeInBytes);

pi_result build_program(const char* build_options);

Expand Down
24 changes: 8 additions & 16 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,14 @@ static RT::PiProgram createBinaryProgram(const ContextImplPtr Context,
#endif

RT::PiProgram Program;

// TODO: Implement `piProgramCreateWithBinary` to not require extra logic for
// the CUDA backend.
const auto Backend = Context->getPlugin().getBackend();
if (Backend == backend::cuda) {
// TODO: Reemplace CreateWithSource with CreateWithBinary in CUDA backend
const char *SignedData = reinterpret_cast<const char *>(Data);
Plugin.call<PiApiKind::piclProgramCreateWithSource>(
Context->getHandleRef(), 1 /*one binary*/, &SignedData, &DataLen,
&Program);
} else {
RT::PiDevice Device = getFirstDevice(Context);
pi_int32 BinaryStatus = CL_SUCCESS;
Plugin.call<PiApiKind::piclProgramCreateWithBinary>(
Context->getHandleRef(), 1 /*one binary*/, &Device, &DataLen, &Data,
&BinaryStatus, &Program);
RT::PiDevice Device = getFirstDevice(Context);
pi_int32 BinaryStatus = CL_SUCCESS;
Plugin.call<PiApiKind::piclProgramCreateWithBinary>(
Context->getHandleRef(), 1 /*one binary*/, &Device, &DataLen, &Data,
&BinaryStatus, &Program);

if (BinaryStatus != CL_SUCCESS) {
throw runtime_error("Creating program with binary failed.", BinaryStatus);
}

return Program;
Expand Down
51 changes: 37 additions & 14 deletions sycl/unittests/pi/cuda/test_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,11 @@ const char *threeParamsTwoLocal = "\n\
TEST_F(CudaKernelsTest, PICreateProgramAndKernel) {

pi_program prog;
pi_int32 binary_status = PI_SUCCESS;
ASSERT_EQ(
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
&binary_status, &prog)),
PI_SUCCESS);

ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
Expand All @@ -147,10 +149,16 @@ TEST_F(CudaKernelsTest, PICreateProgramAndKernel) {
TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {

pi_program prog;
/// NOTE: `binary_status` currently unsused in the CUDA backend but in case we
/// use it at some point in the future, pass it anyway and check the result.
/// Same goes for all the other tests in this file.
pi_int32 binary_status = PI_SUCCESS;
ASSERT_EQ(
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
&binary_status, &prog)),
PI_SUCCESS);
ASSERT_EQ(binary_status, PI_SUCCESS);

ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
prog, 1, &device_, "", nullptr, nullptr)),
Expand All @@ -174,10 +182,13 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {
TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {

pi_program prog;
pi_int32 binary_status = PI_SUCCESS;
ASSERT_EQ(
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
&binary_status, &prog)),
PI_SUCCESS);
ASSERT_EQ(binary_status, PI_SUCCESS);

ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
prog, 1, &device_, "", nullptr, nullptr)),
Expand Down Expand Up @@ -210,10 +221,13 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
TEST_F(CudaKernelsTest, PIKernelSetMemObj) {

pi_program prog;
pi_int32 binary_status = PI_SUCCESS;
ASSERT_EQ(
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
&binary_status, &prog)),
PI_SUCCESS);
ASSERT_EQ(binary_status, PI_SUCCESS);

ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
prog, 1, &device_, "", nullptr, nullptr)),
Expand Down Expand Up @@ -242,10 +256,13 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
TEST_F(CudaKernelsTest, PIkerneldispatch) {

pi_program prog;
pi_int32 binary_status = PI_SUCCESS;
ASSERT_EQ(
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
&binary_status, &prog)),
PI_SUCCESS);
ASSERT_EQ(binary_status, PI_SUCCESS);

ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
prog, 1, &device_, "", nullptr, nullptr)),
Expand Down Expand Up @@ -282,10 +299,13 @@ TEST_F(CudaKernelsTest, PIkerneldispatch) {
TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {

pi_program prog;
pi_int32 binary_status = PI_SUCCESS;
ASSERT_EQ(
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
context_, 1, (const char **)&twoParams, nullptr, &prog)),
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
context_, 1, &device_, nullptr, (const unsigned char **)&twoParams,
&binary_status, &prog)),
PI_SUCCESS);
ASSERT_EQ(binary_status, PI_SUCCESS);

ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
prog, 1, &device_, "", nullptr, nullptr)),
Expand Down Expand Up @@ -333,10 +353,13 @@ TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) {

pi_program prog;
pi_int32 binary_status = PI_SUCCESS;
ASSERT_EQ(
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
context_, 1, (const char **)&threeParamsTwoLocal, nullptr, &prog)),
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
context_, 1, &device_, nullptr,
(const unsigned char **)&threeParamsTwoLocal, &binary_status, &prog)),
PI_SUCCESS);
ASSERT_EQ(binary_status, PI_SUCCESS);

ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
prog, 1, &device_, "", nullptr, nullptr)),
Expand Down