-
Notifications
You must be signed in to change notification settings - Fork 787
[SYCL] Adds support for atomic fence capabilities device queries #8586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
435d760
d126996
fbb2998
8e07de6
8327cb0
46e315a
4bad01d
3a24574
eeb02e9
8829169
fc36ecc
0ce9beb
60132b7
36192ee
c1cf809
bf12a92
9548073
4c59edb
3ab7991
f19eb4c
9e7947c
806e054
c55f52a
b01fef5
30bc569
1571410
cda1cd3
45dd762
d4a5d37
918b923
6490a36
924b9b7
879c096
aea9486
2522df7
8d37dfb
2e1d848
38b415f
20f5e18
bef2e36
626b231
cc6166e
ffc9b92
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -77,9 +77,11 @@ | |||||||||
// 12.22 Add piGetDeviceAndHostTimer to query device wall-clock timestamp | ||||||||||
// 12.23 Added new piextEnqueueDeviceGlobalVariableWrite and | ||||||||||
// piextEnqueueDeviceGlobalVariableRead functions. | ||||||||||
// 12.24 Added PI_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES and | ||||||||||
// PI_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES for piDeviceGetInfo. | ||||||||||
|
||||||||||
#define _PI_H_VERSION_MAJOR 12 | ||||||||||
#define _PI_H_VERSION_MINOR 23 | ||||||||||
#define _PI_H_VERSION_MINOR 24 | ||||||||||
|
||||||||||
#define _PI_STRING_HELPER(a) #a | ||||||||||
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) | ||||||||||
|
@@ -186,7 +188,7 @@ typedef enum : pi_uint64 { | |||||||||
PI_DEVICE_TYPE_CPU = (1 << 1), ///< A PI device that is the host processor. | ||||||||||
PI_DEVICE_TYPE_GPU = (1 << 2), ///< A PI device that is a GPU. | ||||||||||
PI_DEVICE_TYPE_ACC = (1 << 3), ///< A PI device that is a | ||||||||||
///< dedicated accelerator. | ||||||||||
///< dedicated accelerator. | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn't change the layout of those comments, unless clang format complains at it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My local clang-format overreacted. I will revert the changes. |
||||||||||
PI_DEVICE_TYPE_CUSTOM = (1 << 4) ///< A PI device that is a custom device. | ||||||||||
} _pi_device_type; | ||||||||||
|
||||||||||
|
@@ -313,6 +315,8 @@ typedef enum { | |||||||||
PI_DEVICE_INFO_ATOMIC_64 = 0x10110, | ||||||||||
PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111, | ||||||||||
PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000, | ||||||||||
PI_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 0x10114, | ||||||||||
PI_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 0x10115, | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could you rename these and use contiguous values from 0x1FFFF just below
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The file already has There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's up to you if you want to fix unrelated misses. |
||||||||||
PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 0x10112, | ||||||||||
PI_DEVICE_INFO_BACKEND_VERSION = 0x10113, | ||||||||||
// Return whether bfloat16 math functions are supported by device | ||||||||||
|
@@ -561,6 +565,19 @@ constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_GROUP = 0x04; | |||||||||
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_DEVICE = 0x08; | ||||||||||
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_SYSTEM = 0x10; | ||||||||||
|
||||||||||
// CL equivalents are only available for OpenCL version 3.0 | ||||||||||
#define PI_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064 | ||||||||||
using pi_device_atomic_capabilities = pi_bitfield; | ||||||||||
constexpr pi_device_atomic_capabilities PI_DEVICE_ATOMIC_ORDER_RELAXED = 0x01; | ||||||||||
constexpr pi_device_atomic_capabilities PI_DEVICE_ATOMIC_ORDER_ACQ_REL = 0x02; | ||||||||||
constexpr pi_device_atomic_capabilities PI_DEVICE_ATOMIC_ORDER_SEQ_CST = 0x04; | ||||||||||
constexpr pi_device_atomic_capabilities PI_DEVICE_ATOMIC_SCOPE_WORK_ITEM = 0x08; | ||||||||||
constexpr pi_device_atomic_capabilities PI_DEVICE_ATOMIC_SCOPE_WORK_GROUP = | ||||||||||
0x10; | ||||||||||
constexpr pi_device_atomic_capabilities PI_DEVICE_ATOMIC_SCOPE_DEVICE = 0x20; | ||||||||||
constexpr pi_device_atomic_capabilities PI_DEVICE_ATOMIC_SCOPE_ALL_DEVICES = | ||||||||||
0x40; | ||||||||||
|
||||||||||
typedef enum { | ||||||||||
PI_PROFILING_INFO_COMMAND_QUEUED = 0x1280, | ||||||||||
PI_PROFILING_INFO_COMMAND_SUBMIT = 0x1281, | ||||||||||
|
@@ -611,13 +628,13 @@ using pi_queue_properties = pi_bitfield; | |||||||||
constexpr pi_queue_properties PI_QUEUE_FLAGS = -1; | ||||||||||
constexpr pi_queue_properties PI_QUEUE_COMPUTE_INDEX = -2; | ||||||||||
// clang-format off | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE = (1 << 0); | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_PROFILING_ENABLE = (1 << 1); | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_ON_DEVICE = (1 << 2); | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_ON_DEVICE_DEFAULT = (1 << 3); | ||||||||||
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS = (1 << 4); | ||||||||||
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW = (1 << 5); | ||||||||||
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH = (1 << 6); | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE = (1 << 0); | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_PROFILING_ENABLE = (1 << 1); | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_ON_DEVICE = (1 << 2); | ||||||||||
constexpr pi_queue_properties PI_QUEUE_FLAG_ON_DEVICE_DEFAULT = (1 << 3); | ||||||||||
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS = (1 << 4); | ||||||||||
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW = (1 << 5); | ||||||||||
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH = (1 << 6); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the whitespace change necessary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My local clang-format overreacted. I will revert the changes. |
||||||||||
// clang-format on | ||||||||||
|
||||||||||
using pi_result = _pi_result; | ||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1315,6 +1315,10 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, | |
return getInfo(param_value_size, param_value, param_value_size_ret, | ||
capabilities); | ||
} | ||
case PI_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: | ||
case PI_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: | ||
// There is no way to query this in the backend | ||
return PI_ERROR_INVALID_ARG_VALUE; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How is this error reported to the end user? Should we better use set a plugin specific error and have SYCL RT use piPluginGetLastError to retrieve/report it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: { | ||
int major = 0; | ||
sycl::detail::pi::assertion( | ||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -5,6 +5,7 @@ | |||||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||||
// | ||||||||
//===-----------------------------------------------------------------===// | ||||||||
#include <sycl/detail/pi.h> | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. UR should not include PI
Suggested change
|
||||||||
|
||||||||
#include <algorithm> | ||||||||
#include <climits> | ||||||||
|
@@ -1164,6 +1165,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( | |||||||
// bfloat16 math functions are not yet supported on Intel GPUs. | ||||||||
return ReturnValue(bool{false}); | ||||||||
} | ||||||||
case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { | ||||||||
// There are no explicit restrictions in L0 programming guide, so assume all | ||||||||
// are supported | ||||||||
pi_memory_order_capabilities result = | ||||||||
PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | | ||||||||
PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL | | ||||||||
PI_MEMORY_ORDER_SEQ_CST; | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should write this in UR (extended as needed) and add a conversion in |
||||||||
|
||||||||
return ReturnValue(result); | ||||||||
} | ||||||||
case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { | ||||||||
// There are no explicit restrictions in L0 programming guide, so assume all | ||||||||
// are supported | ||||||||
pi_memory_scope_capabilities result = | ||||||||
PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP | | ||||||||
PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE | | ||||||||
PI_MEMORY_SCOPE_SYSTEM; | ||||||||
|
||||||||
return ReturnValue(result); | ||||||||
} | ||||||||
|
||||||||
// TODO: Implement. | ||||||||
case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,10 @@ const int UR_EXT_DEVICE_INFO_FREE_MEMORY = UR_EXT_DEVICE_INFO_END - 13; | |
// const int ZER_EXT_DEVICE_INFO_DEVICE_ID = UR_EXT_DEVICE_INFO_END - 14; | ||
// const int ZER_EXT_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = | ||
// UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; | ||
const int UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = | ||
UR_EXT_DEVICE_INFO_END - 16; | ||
const int UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = | ||
UR_EXT_DEVICE_INFO_END - 17; | ||
Comment on lines
+46
to
+49
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kbenzie: please take this change into UR |
||
|
||
const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = | ||
(ur_device_info_t)0x103D; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -275,6 +275,19 @@ struct get_device_info_impl<std::vector<memory_order>, | |
} | ||
}; | ||
|
||
// Specialization for atomic_fence_order_capabilities, PI returns a bitfield | ||
template <> | ||
struct get_device_info_impl<std::vector<memory_order>, | ||
info::device::atomic_fence_order_capabilities> { | ||
static std::vector<memory_order> get(RT::PiDevice dev, const plugin &Plugin) { | ||
pi_memory_order_capabilities result; | ||
Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why |
||
dev, PiInfoCode<info::device::atomic_fence_order_capabilities>::value, | ||
sizeof(pi_memory_order_capabilities), &result, nullptr); | ||
return readMemoryOrderBitfield(result); | ||
} | ||
}; | ||
|
||
// Specialization for atomic_memory_scope_capabilities, PI returns a bitfield | ||
template <> | ||
struct get_device_info_impl<std::vector<memory_scope>, | ||
|
@@ -288,6 +301,19 @@ struct get_device_info_impl<std::vector<memory_scope>, | |
} | ||
}; | ||
|
||
// Specialization for atomic_fence_scope_capabilities, PI returns a bitfield | ||
template <> | ||
struct get_device_info_impl<std::vector<memory_scope>, | ||
info::device::atomic_fence_scope_capabilities> { | ||
static std::vector<memory_scope> get(RT::PiDevice dev, const plugin &Plugin) { | ||
pi_memory_scope_capabilities result; | ||
Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>( | ||
dev, PiInfoCode<info::device::atomic_fence_scope_capabilities>::value, | ||
sizeof(pi_memory_scope_capabilities), &result, nullptr); | ||
return readMemoryScopeBitfield(result); | ||
} | ||
}; | ||
|
||
// Specialization for bf16 math functions | ||
template <> | ||
struct get_device_info_impl<bool, | ||
|
@@ -1005,13 +1031,27 @@ get_device_info_host<info::device::atomic_memory_order_capabilities>() { | |
memory_order::acq_rel, memory_order::seq_cst}; | ||
} | ||
|
||
template <> | ||
inline std::vector<memory_order> | ||
get_device_info_host<info::device::atomic_fence_order_capabilities>() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder when we will be able to stop specializing that for host device, because it doesn't really exists anymore |
||
return {memory_order::relaxed, memory_order::acquire, memory_order::release, | ||
memory_order::acq_rel}; | ||
} | ||
|
||
template <> | ||
inline std::vector<memory_scope> | ||
get_device_info_host<info::device::atomic_memory_scope_capabilities>() { | ||
return {memory_scope::work_item, memory_scope::sub_group, | ||
memory_scope::work_group, memory_scope::device, memory_scope::system}; | ||
} | ||
|
||
template <> | ||
inline std::vector<memory_scope> | ||
get_device_info_host<info::device::atomic_fence_scope_capabilities>() { | ||
return {memory_scope::work_item, memory_scope::sub_group, | ||
memory_scope::work_group, memory_scope::device, memory_scope::system}; | ||
} | ||
|
||
template <> | ||
inline bool | ||
get_device_info_host<info::device::ext_oneapi_bfloat16_math_functions>() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@kbenzie: please take this change to Unified Runtime
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tracking in oneapi-src/unified-runtime#399