Skip to content

[SYCL] Switch Level Zero plugin to specification v1.0 #2408

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/doc/EnvironmentVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ subject to change. Do not rely on these variables in production code.
| SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the DPC++ runtime will select only those devices which satisfy the regex. |
| SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. |
| SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
| SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |

`(*) Note: Any means this environment variable is effective when set to any non-null value.`

Expand Down
4 changes: 4 additions & 0 deletions sycl/include/CL/sycl/backend/level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ template <> struct interop<backend::level_zero, device> {
using type = ze_device_handle_t;
};

template <> struct interop<backend::level_zero, context> {
using type = ze_context_handle_t;
};

template <> struct interop<backend::level_zero, queue> {
using type = ze_command_queue_handle_t;
};
Expand Down
2 changes: 1 addition & 1 deletion sycl/plugins/level_zero/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
endif()
ExternalProject_Add(level-zero-loader
GIT_REPOSITORY https://github.com/oneapi-src/level-zero.git
GIT_TAG v0.91.21
GIT_TAG v1.0
UPDATE_DISCONNECTED ${SYCL_EP_LEVEL_ZERO_LOADER_SKIP_AUTO_UPDATE}
SOURCE_DIR ${LEVEL_ZERO_LOADER_SOURCE_DIR}
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader_build"
Expand Down
789 changes: 493 additions & 296 deletions sycl/plugins/level_zero/pi_level_zero.cpp

Large diffs are not rendered by default.

110 changes: 83 additions & 27 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <cassert>
#include <cstring>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <unordered_map>
Expand Down Expand Up @@ -76,18 +78,33 @@ struct _pi_platform {
// Cache pi_devices for reuse
std::vector<pi_device> PiDevicesCache;
std::mutex PiDevicesCacheMutex;
// Maximum Number of Command Lists that can be created.
// This Value is initialized to 20000, but can be changed by the user
// thru the environment variable SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE
// ie SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE =10000.
int ZeMaxCommandListCache = 0;

// Current number of L0 Command Lists created on this platform.
// this number must not exceed ZeMaxCommandListCache.
std::atomic<int> ZeGlobalCommandListCount{0};
};

struct _pi_device : _pi_object {
_pi_device(ze_device_handle_t Device, pi_platform Plt,
bool isSubDevice = false)
: ZeDevice{Device}, Platform{Plt}, ZeCommandListInit{nullptr},
IsSubDevice{isSubDevice}, ZeDeviceProperties{},
ZeDeviceComputeProperties{} {
: ZeDevice{Device}, Platform{Plt}, IsSubDevice{isSubDevice},
ZeDeviceProperties{}, ZeDeviceComputeProperties{} {
// NOTE: one must additionally call initialize() to complete
// PI device creation.
}

// Keep the ordinal of a "compute" commands group, where we send all
// commands currently.
// TODO[1.0]: discover "copy" command group as well to use for memory
// copying operations exclusively.
//
uint32_t ZeComputeQueueGroupIndex;

// Initialize the entire PI device.
pi_result initialize();

Expand All @@ -97,23 +114,28 @@ struct _pi_device : _pi_object {
// PI platform to which this device belongs.
pi_platform Platform;

// Immediate Level Zero command list for this device, to be used for
// initializations. To be created as:
// - Immediate command list: So any command appended to it is immediately
// offloaded to the device.
// - Synchronous: So implicit synchronization is made inside the level-zero
// driver.
ze_command_list_handle_t ZeCommandListInit;
// Mutex Lock for the Command List Cache
std::mutex ZeCommandListCacheMutex;
// Cache of all currently Available Command Lists for use by PI APIs
std::list<ze_command_list_handle_t> ZeCommandListCache;

// Indicates if this is a root-device or a sub-device.
// Technically this information can be queried from a device handle, but it
// seems better to just keep it here.
bool IsSubDevice;

// Create a new command list for executing on this device.
// It's caller's responsibility to remember and destroy the created
// command list when no longer needed.
pi_result createCommandList(ze_command_list_handle_t *ze_command_list);
// Retrieves a command list for executing on this device along with
// a fence to be used in tracking the execution of this command list.
// If a command list has been created on this device which has
// completed its commands, then that command list and its associated fence
// will be reused. Otherwise, a new command list and fence will be created for
// running on this device. L0 fences are created on a L0 command queue so the
// caller must pass a command queue to create a new fence for the new command
// list if a command list/fence pair is not available. All Command Lists &
// associated fences are destroyed at Device Release.
pi_result getAvailableCommandList(pi_queue Queue,
ze_command_list_handle_t *ZeCommandList,
ze_fence_handle_t *ZeFence);

// Cache of the immutable device properties.
ze_device_properties_t ZeDeviceProperties;
Expand All @@ -122,14 +144,27 @@ struct _pi_device : _pi_object {

struct _pi_context : _pi_object {
_pi_context(pi_device Device)
: Device{Device}, ZeEventPool{nullptr}, NumEventsAvailableInEventPool{},
NumEventsLiveInEventPool{} {}
: Device{Device}, ZeCommandListInit{nullptr}, ZeEventPool{nullptr},
NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} {}

// A L0 context handle is primarily used during creation and management of
// resources that may be used by multiple devices.
ze_context_handle_t ZeContext;

// Level Zero does not have notion of contexts.
// Keep the device here (must be exactly one) to return it when PI context
// is queried for devices.
pi_device Device;

// Immediate Level Zero command list for the device in this context, to be
// used for initializations. To be created as:
// - Immediate command list: So any command appended to it is immediately
// offloaded to the device.
// - Synchronous: So implicit synchronization is made inside the level-zero
// driver.
// There will be a list of immediate command lists (for each device) when
// support of the multiple devices per context will be added.
ze_command_list_handle_t ZeCommandListInit;

// Get index of the free slot in the available pool. If there is no avialble
// pool then create new one.
ze_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &,
Expand Down Expand Up @@ -169,25 +204,46 @@ struct _pi_context : _pi_object {
};

struct _pi_queue : _pi_object {
_pi_queue(ze_command_queue_handle_t Queue, pi_context Context)
: ZeCommandQueue{Queue}, Context{Context} {}
_pi_queue(ze_command_queue_handle_t Queue, pi_context Context,
pi_device Device)
: ZeCommandQueue{Queue}, Context{Context}, Device{Device} {}

// Level Zero command queue handle.
ze_command_queue_handle_t ZeCommandQueue;

// Keeps the PI context to which this queue belongs.
pi_context Context;

// Mutex Lock for the Command List, Fence Map
std::mutex ZeCommandListFenceMapMutex;
// Map of all Command lists created with their associated Fence used for
// tracking when the command list is available for use again.
std::map<ze_command_list_handle_t, ze_fence_handle_t> ZeCommandListFenceMap;

// Resets the Command List and Associated fence in the ZeCommandListFenceMap.
// If the reset command list should be made available, then MakeAvailable
// needs to be set to true. The caller must verify that this command list and
// fence have been signalled and call while holding the
// ZeCommandListFenceMapMutex lock.
pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
bool MakeAvailable);

// Keeps the PI device to which this queue belongs.
pi_device Device;

// Attach a command list to this queue, close, and execute it.
// Note that this command list cannot be appended to after this.
// The "is_blocking" tells if the wait for completion is requested.
// The "ZeFence" passed is used to track when the command list passed
// has completed execution on the device and can be reused.
pi_result executeCommandList(ze_command_list_handle_t ZeCommandList,
ze_fence_handle_t ZeFence,
bool is_blocking = false);
};

struct _pi_mem : _pi_object {
// Keeps the PI platform of this memory handle.
pi_platform Platform;
// Keeps the PI context of this memory handle.
pi_context Context;

// Keeps the host pointer where the buffer will be mapped to,
// if created with PI_MEM_FLAGS_HOST_PTR_USE (see
Expand Down Expand Up @@ -221,8 +277,8 @@ struct _pi_mem : _pi_object {
pi_result removeMapping(void *MappedTo, Mapping &MapInfo);

protected:
_pi_mem(pi_platform Plt, char *HostPtr)
: Platform{Plt}, MapHostPtr{HostPtr}, Mappings{} {}
_pi_mem(pi_context Ctx, char *HostPtr)
: Context{Ctx}, MapHostPtr{HostPtr}, Mappings{} {}

private:
// The key is the host pointer representing an active mapping.
Expand All @@ -237,9 +293,9 @@ struct _pi_mem : _pi_object {

struct _pi_buffer final : _pi_mem {
// Buffer/Sub-buffer constructor
_pi_buffer(pi_platform Plt, char *Mem, char *HostPtr,
_pi_buffer(pi_context Ctx, char *Mem, char *HostPtr,
_pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0)
: _pi_mem(Plt, HostPtr), ZeMem{Mem}, SubBuffer{Parent, Origin, Size} {}
: _pi_mem(Ctx, HostPtr), ZeMem{Mem}, SubBuffer{Parent, Origin, Size} {}

void *getZeHandle() override { return ZeMem; }

Expand All @@ -262,8 +318,8 @@ struct _pi_buffer final : _pi_mem {

struct _pi_image final : _pi_mem {
// Image constructor
_pi_image(pi_platform Plt, ze_image_handle_t Image, char *HostPtr)
: _pi_mem(Plt, HostPtr), ZeImage{Image} {}
_pi_image(pi_context Ctx, ze_image_handle_t Image, char *HostPtr)
: _pi_mem(Ctx, HostPtr), ZeImage{Image} {}

void *getZeHandle() override { return ZeImage; }

Expand Down
15 changes: 9 additions & 6 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -777,12 +777,15 @@ ProgramManager::ProgramPtr ProgramManager::build(
LinkOpts = LinkOptions.c_str();
}

// Level-Zero plugin doesn't support piProgramCompile/piProgramLink commands,
// program is built during piProgramCreate.
// TODO: remove this check as soon as piProgramCompile/piProgramLink will be
// implemented in Level-Zero plugin.
if (Context->getPlugin().getBackend() == backend::level_zero) {
LinkDeviceLibs = false;
// The Level Zero driver support for online linking currently has bugs, but
// we think the DPC++ runtime support is ready. This environment variable
// gates the runtime support for online linking, so we can try enabling if a
// new driver is released before the next DPC++ release.
static bool EnableLevelZeroLink = std::getenv("SYCL_ENABLE_LEVEL_ZERO_LINK");
if (!EnableLevelZeroLink) {
if (Context->getPlugin().getBackend() == backend::level_zero) {
LinkDeviceLibs = false;
}
}

// TODO: this is a temporary workaround for GPU tests for ESIMD compiler.
Expand Down
49 changes: 38 additions & 11 deletions sycl/test/basic_tests/event_profiling_info.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
//
// Profiling info is not supported on host device so far.
//
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
Expand All @@ -17,16 +15,9 @@
#include <CL/sycl.hpp>
#include <cassert>

using namespace cl;

// The test checks that get_profiling_info waits for command asccociated with
// event to complete execution.
int main() {
sycl::queue Q{sycl::property::queue::enable_profiling()};
sycl::event Event = Q.submit([&](sycl::handler &CGH) {
CGH.single_task<class EmptyKernel>([=]() {});
});
using namespace cl::sycl;

bool verifyProfiling(event Event) {
auto Submit =
Event.get_profiling_info<sycl::info::event_profiling::command_submit>();
auto Start =
Expand All @@ -42,3 +33,39 @@ int main() {

return Fail;
}

// The test checks that get_profiling_info waits for command asccociated with
// event to complete execution.
int main() {
const size_t Size = 10000;
int Data[Size] = {0};
for (size_t I = 0; I < Size; ++I) {
Data[I] = I;
}
int Values[Size] = {0};

buffer<int, 1> BufferFrom(Data, range<1>(Size));
buffer<int, 1> BufferTo(Values, range<1>(Size));

// buffer copy
queue copyQueue{sycl::property::queue::enable_profiling()};
event copyEvent = copyQueue.submit([&](sycl::handler &Cgh) {
accessor<int, 1, access::mode::read, access::target::global_buffer>
AccessorFrom(BufferFrom, Cgh, range<1>(Size));
accessor<int, 1, access::mode::write, access::target::global_buffer>
AccessorTo(BufferTo, Cgh, range<1>(Size));
Cgh.copy(AccessorFrom, AccessorTo);
});

for (size_t I = 0; I < Size; ++I) {
assert(Data[I] == Values[I]);
}

// kernel launch
queue kernelQueue{sycl::property::queue::enable_profiling()};
event kernelEvent = kernelQueue.submit([&](sycl::handler &CGH) {
CGH.single_task<class EmptyKernel>([=]() {});
});

return verifyProfiling(copyEvent) || verifyProfiling(kernelEvent);
}
2 changes: 1 addition & 1 deletion sycl/tools/get_device_count_by_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ static bool queryOpenCL(cl_device_type deviceType, cl_uint &deviceCount,
static bool queryLevelZero(cl_device_type deviceType, cl_uint &deviceCount,
std::string &msg) {
deviceCount = 0u;
ze_result_t zeResult = zeInit(ZE_INIT_FLAG_NONE);
ze_result_t zeResult = zeInit(ZE_INIT_FLAG_GPU_ONLY);
if (zeResult != ZE_RESULT_SUCCESS) {
msg = "ERROR: Level Zero initialization error";
return true;
Expand Down