Skip to content

[SYCL][USM] Enable per-context USM behavior. Use PI interfaces and avoid directly calling CL inside SYCL RT. #517

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Aug 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 2 additions & 32 deletions sycl/include/CL/sycl/detail/clusm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,8 @@ namespace usm {

class CLUSM {
public:
static bool Create(CLUSM *&pCLUSM);
static void Delete(CLUSM *&pCLUSM);

void initExtensions(cl_platform_id platform);
CLUSM() = default;
~CLUSM() = default;

void *hostMemAlloc(cl_context context, cl_mem_properties_intel *properties,
size_t size, cl_uint alignment, cl_int *errcode_ret);
Expand All @@ -54,18 +52,9 @@ class CLUSM {
cl_int writeParamToMemory(size_t param_value_size, T param,
size_t *param_value_size_ret, T *pointer) const;

bool useCLUSM() { return mEnableCLUSM; }

bool isInitialized() { return mInitialized; }

private:
bool mEnableCLUSM = true;
bool mInitialized = false;
std::mutex mLock;

CLUSM() = default;
~CLUSM() = default;

struct SUSMAllocInfo {
SUSMAllocInfo() = default;

Expand Down Expand Up @@ -106,25 +95,6 @@ class CLUSM {
};

} // namespace usm

namespace cliext {
bool initializeExtensions(cl_platform_id platform);
} // namespace cliext

} // namespace detail
} // namespace sycl
} // namespace cl

__SYCL_EXPORTED extern cl::sycl::detail::usm::CLUSM *gCLUSM;
inline cl::sycl::detail::usm::CLUSM *GetCLUSM() {
if (gCLUSM == nullptr) {
cl::sycl::detail::usm::CLUSM::Create(gCLUSM);
}

cl::sycl::detail::usm::CLUSM *retVal = nullptr;
if (cl::sycl::detail::pi::useBackend(
cl::sycl::detail::pi::Backend::SYCL_BE_PI_OPENCL)) {
retVal = gCLUSM;
}
return retVal;
}
3 changes: 3 additions & 0 deletions sycl/include/CL/sycl/detail/context_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <CL/sycl/detail/common.hpp>
#include <CL/sycl/detail/os_util.hpp>
#include <CL/sycl/detail/pi.hpp>
#include <CL/sycl/detail/usm_dispatch.hpp>
#include <CL/sycl/exception_list.hpp>
#include <CL/sycl/info/info_desc.hpp>
#include <CL/sycl/platform.hpp>
Expand Down Expand Up @@ -64,6 +65,7 @@ class context_impl {
return m_CachedKernels;
}

std::shared_ptr<usm::USMDispatcher> getUSMDispatch() const;
private:
async_handler m_AsyncHandler;
vector_class<device> m_Devices;
Expand All @@ -73,6 +75,7 @@ class context_impl {
bool m_HostContext;
std::map<OSModuleHandle, RT::PiProgram> m_CachedPrograms;
std::map<RT::PiProgram, std::map<string_class, RT::PiKernel>> m_CachedKernels;
std::shared_ptr<usm::USMDispatcher> m_USMDispatch;
};

} // namespace detail
Expand Down
73 changes: 73 additions & 0 deletions sycl/include/CL/sycl/detail/usm_dispatch.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//==-------------- usm_dispatch.hpp - SYCL USM Dispatch --------*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// ===--------------------------------------------------------------------=== //
#pragma once

#include <CL/sycl/detail/clusm.hpp>

#include <memory>

namespace cl {
namespace sycl {
namespace detail {
namespace usm {

class USMDispatcher {
public:
USMDispatcher(cl_platform_id Platform);

void *hostMemAlloc(pi_context Context, cl_mem_properties_intel *Properties,
size_t Size, pi_uint32 Alignment, pi_result *ErrcodeRet);
void *deviceMemAlloc(pi_context Context, pi_device Device,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you going to move these functions to pi.hpp/pi.cpp ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I think it's better to keep them here as we need some state (the CLUSM object). In some sense the Dispatcher is separate piece of PI. It's easy to wire new backends up here.

cl_mem_properties_intel *Properties, size_t Size,
pi_uint32 Alignment, pi_result *ErrcodeRet);
void *sharedMemAlloc(pi_context Context, pi_device Device,
cl_mem_properties_intel *Properties, size_t Size,
pi_uint32 Alignment, pi_result *ErrcodeRet);
pi_result memFree(pi_context Context, void *Ptr);
pi_result setKernelArgMemPointer(pi_kernel Kernel, pi_uint32 ArgIndex,
const void *ArgValue);
void setKernelIndirectAccess(pi_kernel Kernel, pi_queue Queue);
pi_result enqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value,
size_t Count, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *Event);
pi_result enqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DestPtr,
const void *SrcPtr, size_t Size,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *Event);
pi_result enqueueMigrateMem(pi_queue Queue, const void *Ptr, size_t Size,
cl_mem_migration_flags Flags,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *Event);
pi_result enqueueMemAdvise(pi_queue Queue, void *Ptr, size_t Size,
cl_mem_advice_intel Advice,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *Event);
pi_result getMemAllocInfo(pi_context Context, const void *Ptr,
cl_mem_info_intel ParamName, size_t ParamValueSize,
void *ParamValue, size_t *ParamValueSizeRet);

private:
bool mEmulated = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably instead of having this var you can check if mEmulator is nullptr or not.

std::unique_ptr<CLUSM> mEmulator;

clHostMemAllocINTEL_fn pfn_clHostMemAllocINTEL = nullptr;
clDeviceMemAllocINTEL_fn pfn_clDeviceMemAllocINTEL = nullptr;
clSharedMemAllocINTEL_fn pfn_clSharedMemAllocINTEL = nullptr;
clMemFreeINTEL_fn pfn_clMemFreeINTEL = nullptr;
clGetMemAllocInfoINTEL_fn pfn_clGetMemAllocInfoINTEL = nullptr;
clSetKernelArgMemPointerINTEL_fn pfn_clSetKernelArgMemPointerINTEL = nullptr;
clEnqueueMemsetINTEL_fn pfn_clEnqueueMemsetINTEL = nullptr;
clEnqueueMemcpyINTEL_fn pfn_clEnqueueMemcpyINTEL = nullptr;
clEnqueueMigrateMemINTEL_fn pfn_clEnqueueMigrateMemINTEL = nullptr;
clEnqueueMemAdviseINTEL_fn pfn_clEnqueueMemAdviseINTEL = nullptr;
};

} // namespace usm
} // namespace detail
} // namespace sycl
} // namespace cl
9 changes: 6 additions & 3 deletions sycl/include/CL/sycl/detail/usm_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@ namespace sycl {
namespace detail {
namespace usm {

void *alignedAlloc(size_t alignment, size_t bytes, const context *ctxt,
const device *dev, cl::sycl::usm::alloc kind);
void *alignedAlloc(size_t Alignment, size_t Bytes, const context &Ctxt,
const device &Dev, cl::sycl::usm::alloc Kind);

void free(void *ptr, const context *ctxt);
void *alignedAlloc(size_t Alignment, size_t Bytes, const context &Ctxt,
cl::sycl::usm::alloc Kind);

void free(void *Ptr, const context &Ctxt);

} // namespace usm
} // namespace detail
Expand Down
27 changes: 19 additions & 8 deletions sycl/include/CL/sycl/usm/usm_allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ class usm_allocator {
typedef usm_allocator<U, AllocKind, Alignment> other;
};

usm_allocator() : mContext(nullptr), mDevice(nullptr) {}
usm_allocator(const context *ctxt, const device *dev)
: mContext(ctxt), mDevice(dev) {}
usm_allocator(const usm_allocator &other)
: mContext(other.mContext), mDevice(other.mDevice) {}
usm_allocator() = delete;
usm_allocator(const context &Ctxt, const device &Dev)
: mContext(Ctxt), mDevice(Dev) {}
usm_allocator(const usm_allocator &Other)
: mContext(Other.mContext), mDevice(Other.mDevice) {}

// Construct an object
// Note: AllocKind == alloc::device is not allowed
Expand Down Expand Up @@ -105,10 +105,21 @@ class usm_allocator {
}

// Allocate memory
template <
usm::alloc AllocT = AllocKind,
typename std::enable_if<AllocT == usm::alloc::host, int>::type = 0>
pointer allocate(size_t Size) {
if (!mContext && !mDevice) {
auto Result = reinterpret_cast<pointer>(detail::usm::alignedAlloc(
getAlignment(), Size * sizeof(value_type), mContext, AllocKind));
if (!Result) {
throw memory_allocation_error();
}
return Result;
}

template <usm::alloc AllocT = AllocKind,
typename std::enable_if<AllocT != usm::alloc::host, int>::type = 0>
pointer allocate(size_t Size) {
auto Result = reinterpret_cast<pointer>(
detail::usm::alignedAlloc(getAlignment(), Size * sizeof(value_type),
mContext, mDevice, AllocKind));
Expand Down Expand Up @@ -137,8 +148,8 @@ class usm_allocator {
return Alignment;
}

const context *mContext;
const device *mDevice;
const context mContext;
const device mDevice;
};

} // namespace sycl
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ add_library(sycl SHARED
"detail/scheduler/graph_processor.cpp"
"detail/scheduler/graph_builder.cpp"
"detail/usm/clusm.cpp"
"detail/usm/opencl_shim.cpp"
"detail/usm/usm_dispatch.cpp"
"detail/usm/usm_impl.cpp"
"detail/util.cpp"
"context.cpp"
Expand Down
9 changes: 5 additions & 4 deletions sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,7 @@ context_impl::context_impl(const vector_class<cl::sycl::device> Devices,
RT::piContextCreate(0, DeviceIds.size(), DeviceIds.data(), 0, 0, &Err),
Err));

if (usm::CLUSM* clusm = GetCLUSM()) {
cl_platform_id id = m_Platform.get();
clusm->initExtensions(id);
}
m_USMDispatch.reset(new usm::USMDispatcher(m_Platform.get()));
}

context_impl::context_impl(cl_context ClContext, async_handler AsyncHandler)
Expand Down Expand Up @@ -126,6 +123,10 @@ context_impl::get_info<info::context::devices>() const {
RT::PiContext &context_impl::getHandleRef() { return m_Context; }
const RT::PiContext &context_impl::getHandleRef() const { return m_Context; }

std::shared_ptr<usm::USMDispatcher> context_impl::getUSMDispatch() const {
return m_USMDispatch;
}

} // namespace detail
} // namespace sycl
} // namespace cl
40 changes: 20 additions & 20 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <CL/sycl/detail/clusm.hpp>
#include <CL/sycl/detail/pi.hpp>
#include <CL/sycl/detail/queue_impl.hpp>
#include <CL/sycl/detail/usm_dispatch.hpp>
#include <CL/sycl/device.hpp>

namespace cl {
Expand All @@ -32,32 +33,31 @@ template <> device queue_impl::get_info<info::queue::device>() const {
return get_device();
}

// TODO: Update with PI interfaces
event queue_impl::memset(void* ptr, int value, size_t count) {
cl_event e;
cl_int error;
cl_command_queue q = pi::cast<cl_command_queue>(getHandleRef());
event queue_impl::memset(void *Ptr, int Value, size_t Count) {
context Context = get_context();
std::shared_ptr<usm::USMDispatcher> USMDispatch =
getSyclObjImpl(Context)->getUSMDispatch();
cl_event Event;

error = clEnqueueMemsetINTEL(q, ptr, value, count,
/* sizeof waitlist */ 0, nullptr, &e);
PI_CHECK(USMDispatch->enqueueMemset(getHandleRef(), Ptr, Value, Count,
/* sizeof waitlist */ 0, nullptr,
reinterpret_cast<pi_event *>(&Event)));

CHECK_OCL_CODE_THROW(error, runtime_error);

return event(e, get_context());
return event(Event, Context);
}

event queue_impl::memcpy(void* dest, const void* src, size_t count) {
cl_event e;
cl_int error;
cl_command_queue q = pi::cast<cl_command_queue>(getHandleRef());

error = clEnqueueMemcpyINTEL(q,
/* blocking */ false, dest, src, count,
/* sizeof waitlist */ 0, nullptr, &e);
event queue_impl::memcpy(void *Dest, const void *Src, size_t Count) {
context Context = get_context();
std::shared_ptr<usm::USMDispatcher> USMDispatch =
getSyclObjImpl(Context)->getUSMDispatch();
cl_event Event;

CHECK_OCL_CODE_THROW(error, runtime_error);
PI_CHECK(USMDispatch->enqueueMemcpy(getHandleRef(),
/* blocking */ false, Dest, Src, Count,
/* sizeof waitlist */ 0, nullptr,
reinterpret_cast<pi_event *>(&Event)));

return event(e, get_context());
return event(Event, Context);
}
} // namespace detail
} // namespace sycl
Expand Down
42 changes: 8 additions & 34 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -660,10 +660,12 @@ cl_int ExecCGCommand::enqueueImp() {
case kernel_param_kind_t::kind_pointer: {
// TODO: Change to PI
usesUSM = true;
std::shared_ptr<usm::USMDispatcher> USMDispatch =
getSyclObjImpl(Context)->getUSMDispatch();
auto PtrToPtr = reinterpret_cast<intptr_t*>(Arg.MPtr);
auto DerefPtr = reinterpret_cast<void*>(*PtrToPtr);
auto theKernel = pi::cast<cl_kernel>(Kernel);
CHECK_OCL_CODE(clSetKernelArgMemPointerINTEL(theKernel, Arg.MIndex, DerefPtr));
pi::cast<RT::PiResult>(
USMDispatch->setKernelArgMemPointer(Kernel, Arg.MIndex, DerefPtr));
break;
}
default:
Expand All @@ -675,38 +677,10 @@ cl_int ExecCGCommand::enqueueImp() {
detail::getSyclObjImpl(
MQueue->get_device())->getHandleRef());

// TODO: Replace CL with PI
auto clusm = GetCLUSM();
if (usesUSM && clusm) {
cl_bool t = CL_TRUE;
auto theKernel = pi::cast<cl_kernel>(Kernel);
// Enable USM Indirect Access for Kernels
if (clusm->useCLUSM()) {
CHECK_OCL_CODE(clusm->setKernelExecInfo(
theKernel, CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL,
sizeof(cl_bool), &t));
CHECK_OCL_CODE(clusm->setKernelExecInfo(
theKernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL,
sizeof(cl_bool), &t));
CHECK_OCL_CODE(clusm->setKernelExecInfo(
theKernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL,
sizeof(cl_bool), &t));

// This passes all the allocations we've tracked as SVM Pointers
CHECK_OCL_CODE(clusm->setKernelIndirectUSMExecInfo(
pi::cast<cl_command_queue>(MQueue->getHandleRef()), theKernel));
} else if (clusm->isInitialized()) {
// Sanity check that nothing went wrong setting up clusm
CHECK_OCL_CODE(clSetKernelExecInfo(
theKernel, CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL,
sizeof(cl_bool), &t));
CHECK_OCL_CODE(clSetKernelExecInfo(
theKernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL,
sizeof(cl_bool), &t));
CHECK_OCL_CODE(clSetKernelExecInfo(
theKernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL,
sizeof(cl_bool), &t));
}
if (usesUSM) {
std::shared_ptr<usm::USMDispatcher> USMDispatch =
getSyclObjImpl(Context)->getUSMDispatch();
USMDispatch->setKernelIndirectAccess(Kernel, MQueue->getHandleRef());
}

PI_CALL(RT::piEnqueueKernelLaunch(
Expand Down
Loading