Skip to content

Commit 7b3f215

Browse files
authored
[SYCL] [ABI-Break] Partial implementation of sycl_ext_oneapi_cuda_cluster_group (#14113)
This PR is a partial implementation of [`sycl_ext_oneapi_cuda_cluster_group`](#13594), introducing the `cluster_size` property to launch a kernel with CUDA's thread block clusters Only a small part of the extension specification described in #13594 is used in this implementation. To be specific everything after the section "Launching a kernel with a `cluster_group`" is not included in this PR. A very important point to note is that this PR still fully represents a functional use case of using Nvidia's cuda driver cluster launch feature for its primary purpose which is to accelerate cross-work-group collective operations (particularly for GEMM), leveraging cross-work group asynchronous multi-casting of distributed shared memory across work-groups. This is a high priority feature that is targeted for the next release. The other parts of the extension specification described in #13594, primarily related to the "cluster_group" abstraction is a (user-facing) convenience abstraction that is not required to be exposed in libraries that optimize such library collective operations (GEMM). Please therefore focus reviews of this PR on the relevant aspects of the extension that are required for the implementation in this PR and the library based application of it as described in this message. --------- Signed-off-by: JackAKirk <[email protected]>
1 parent ee4bfa5 commit 7b3f215

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+822
-117
lines changed

llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def AspectExt_oneapi_limited_graph : Aspect<"ext_oneapi_limited_graph">;
8282
def AspectExt_oneapi_private_alloca : Aspect<"ext_oneapi_private_alloca">;
8383
def AspectExt_oneapi_queue_profiling_tag : Aspect<"ext_oneapi_queue_profiling_tag">;
8484
def AspectExt_oneapi_virtual_mem : Aspect<"ext_oneapi_virtual_mem">;
85+
def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">;
8586
// Deprecated aspects
8687
def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
8788
def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">;
@@ -139,7 +140,7 @@ def : TargetInfo<"__TestAspectList",
139140
AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group,
140141
AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component,
141142
AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_limited_graph,
142-
AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem],
143+
AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group],
143144
[]>;
144145
// This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT
145146
// match.
@@ -205,9 +206,9 @@ def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindles
205206
def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
206207
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
207208
def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
208-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
209+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
209210
def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
210-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
211+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
211212

212213
//
213214
// HIP / AMDGPU device aspects

sycl/doc/design/DeviceConfigFile.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ def AspectExt_intel_free_memory : Aspect<"ext_intel_free_memory">;
180180
def AspectExt_intel_device_id : Aspect<"ext_intel_device_id">;
181181
def AspectExt_intel_memory_clock_rate : Aspect<"ext_intel_memory_clock_rate">;
182182
def AspectExt_intel_memory_bus_width : Aspect<"ext_intel_memory_bus_width">;
183+
def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">;
183184
def AspectEmulated : Aspect<"emulated">;
184185
185186
def TargetTable : DynamicTable {

sycl/include/sycl/detail/cg.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ class CGExecKernel : public CG {
178178
std::vector<std::shared_ptr<const void>> MAuxiliaryResources;
179179
sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig;
180180
bool MKernelIsCooperative = false;
181+
bool MKernelUsesClusterLaunch = false;
181182

182183
CGExecKernel(NDRDescT NDRDesc, std::shared_ptr<HostKernelBase> HKernel,
183184
std::shared_ptr<detail::kernel_impl> SyclKernel,
@@ -188,15 +189,17 @@ class CGExecKernel : public CG {
188189
std::vector<std::shared_ptr<const void>> AuxiliaryResources,
189190
CGTYPE Type,
190191
sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
191-
bool KernelIsCooperative, detail::code_location loc = {})
192+
bool KernelIsCooperative, bool MKernelUsesClusterLaunch,
193+
detail::code_location loc = {})
192194
: CG(Type, std::move(CGData), std::move(loc)),
193195
MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
194196
MSyclKernel(std::move(SyclKernel)),
195197
MKernelBundle(std::move(KernelBundle)), MArgs(std::move(Args)),
196198
MKernelName(std::move(KernelName)), MStreams(std::move(Streams)),
197199
MAuxiliaryResources(std::move(AuxiliaryResources)),
198200
MKernelCacheConfig(std::move(KernelCacheConfig)),
199-
MKernelIsCooperative(KernelIsCooperative) {
201+
MKernelIsCooperative(KernelIsCooperative),
202+
MKernelUsesClusterLaunch(MKernelUsesClusterLaunch) {
200203
assert(getType() == Kernel && "Wrong type of exec kernel CG.");
201204
}
202205

sycl/include/sycl/detail/cg_types.hpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ class ArgDesc {
5050
int MIndex;
5151
};
5252

53-
// The structure represents NDRange - global, local sizes, global offset and
54-
// number of dimensions.
53+
// The structure represents NDRange - global, local sizes, global offset,
54+
// number of dimensions, and the cluster dimensions if applicable.
5555
class NDRDescT {
5656
// The method initializes all sizes for dimensions greater than the passed one
5757
// to the default values, so they will not affect execution.
@@ -128,13 +128,25 @@ class NDRDescT {
128128
Dims = Dims_;
129129
}
130130

131+
template <int Dims_> void setClusterDimensions(sycl::range<Dims_> N) {
132+
if (Dims_ != Dims) {
133+
throw std::runtime_error(
134+
"Dimensionality of cluster, global and local ranges must be same");
135+
}
136+
137+
for (int I = 0; I < Dims_; ++I) {
138+
ClusterDimensions[I] = N[I];
139+
}
140+
}
141+
131142
sycl::range<3> GlobalSize;
132143
sycl::range<3> LocalSize;
133144
sycl::id<3> GlobalOffset;
134145
/// Number of workgroups, used to record the number of workgroups from the
135146
/// simplest form of parallel_for_work_group. If set, all other fields must be
136147
/// zero
137148
sycl::range<3> NumWorkGroups;
149+
sycl::range<3> ClusterDimensions{1, 1, 1};
138150
size_t Dims;
139151
};
140152

sycl/include/sycl/detail/pi.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,4 +230,7 @@ _PI_API(piextVirtualMemGetInfo)
230230
// Enqueue native command
231231
_PI_API(piextEnqueueNativeCommand)
232232

233+
// Kernel Launch Properties
234+
_PI_API(piextEnqueueKernelLaunchCustom)
235+
233236
#undef _PI_API

sycl/include/sycl/detail/pi.h

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,9 +197,11 @@
197197
// pi_virtual_access_flags bit flags.
198198
// 15.55 Added piextEnqueueNativeCommand as well as associated types and enums
199199
// 16.56 Replaced piextUSMEnqueueMemset with piextUSMEnqueueFill
200+
// 16.57 Added mappings to UR launch properties extension
201+
// (piextEnqueueKernelLaunchCustom)
200202

201203
#define _PI_H_VERSION_MAJOR 16
202-
#define _PI_H_VERSION_MINOR 56
204+
#define _PI_H_VERSION_MINOR 57
203205

204206
#define _PI_STRING_HELPER(a) #a
205207
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -514,8 +516,12 @@ typedef enum {
514516

515517
// Virtual memory support
516518
PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM = 0x2011E,
519+
517520
// Native enqueue
518521
PI_EXT_ONEAPI_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT = 0x2011F,
522+
523+
// Return whether cluster launch is supported by device
524+
PI_EXT_ONEAPI_DEVICE_INFO_CLUSTER_LAUNCH = 0x2021,
519525
} _pi_device_info;
520526

521527
typedef enum {
@@ -1317,8 +1323,28 @@ typedef enum {
13171323
///< P2P link, otherwise such operations are not supported.
13181324
} _pi_peer_attr;
13191325

1326+
typedef enum {
1327+
PI_LAUNCH_PROPERTY_IGNORE = 0x0,
1328+
PI_LAUNCH_PROPERTY_COOPERATIVE = 0x1,
1329+
PI_LAUNCH_PROPERTY_CLUSTER_DIMENSION = 0x2,
1330+
} _pi_launch_property_id;
1331+
1332+
typedef union {
1333+
int cooperative;
1334+
int32_t cluster_dims[3];
1335+
} _pi_launch_property_value;
1336+
13201337
using pi_mem_info = _pi_mem_info;
13211338
using pi_peer_attr = _pi_peer_attr;
1339+
using pi_launch_property_id = _pi_launch_property_id;
1340+
using pi_launch_property_value = _pi_launch_property_value;
1341+
1342+
typedef struct {
1343+
pi_launch_property_id id;
1344+
pi_launch_property_value value;
1345+
} _pi_launch_property;
1346+
1347+
using pi_launch_property = _pi_launch_property;
13221348

13231349
//
13241350
// Following section contains SYCL RT Plugin Interface (PI) functions.
@@ -1933,6 +1959,14 @@ __SYCL_EXPORT pi_result piextEnqueueCooperativeKernelLaunch(
19331959
const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
19341960
const pi_event *event_wait_list, pi_event *event);
19351961

1962+
__SYCL_EXPORT pi_result piextEnqueueKernelLaunchCustom(
1963+
pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
1964+
const size_t *global_work_size, const size_t *local_work_size,
1965+
pi_uint32 num_props_in_launch_prop_list,
1966+
const pi_launch_property *launch_prop_list,
1967+
pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
1968+
pi_event *event);
1969+
19361970
__SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue,
19371971
pi_uint32 num_events_in_wait_list,
19381972
const pi_event *event_wait_list,

sycl/include/sycl/detail/pi.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ using PiImageOffset = ::pi_image_offset_struct;
148148
using PiImageRegion = ::pi_image_region_struct;
149149
using PiPhysicalMem = ::pi_physical_mem;
150150
using PiVirtualAccessFlags = ::pi_virtual_access_flags;
151+
using PiLaunchProperty = ::pi_launch_property;
151152

152153
__SYCL_EXPORT void contextSetExtendedDeleter(const sycl::context &constext,
153154
pi_context_extended_deleter func,

sycl/include/sycl/device_aspect_macros.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,11 @@
381381
#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_virtual_mem__ 0
382382
#endif
383383

384+
#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_cuda_cluster_group__
385+
// __SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75)
386+
#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_cuda_cluster_group__ 0
387+
#endif
388+
384389
#ifndef __SYCL_ANY_DEVICE_HAS_host__
385390
// __SYCL_ASPECT(host, 0)
386391
#define __SYCL_ANY_DEVICE_HAS_host__ 0
@@ -750,3 +755,8 @@
750755
// __SYCL_ASPECT(ext_oneapi_virtual_mem, 74)
751756
#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_virtual_mem__ 0
752757
#endif
758+
759+
#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_cuda_cluster_group__
760+
// __SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75)
761+
#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_cuda_cluster_group__ 0
762+
#endif
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
//==--- cluster_group_prop.hpp --- SYCL extension for cuda cluster group ---==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include <sycl/ext/oneapi/properties/properties.hpp>
12+
#include <sycl/range.hpp>
13+
14+
namespace sycl {
15+
inline namespace _V1 {
16+
namespace ext::oneapi::experimental {
17+
18+
namespace cuda {
19+
template <int Dim>
20+
struct cluster_size
21+
: ::sycl::ext::oneapi::experimental::detail::run_time_property_key<
22+
::sycl::ext::oneapi::experimental::detail::ClusterLaunch> {
23+
cluster_size(const range<Dim> &size) : size(size) {}
24+
sycl::range<Dim> get_cluster_size() { return size; }
25+
26+
private:
27+
range<Dim> size;
28+
};
29+
30+
template <int Dim> using cluster_size_key = cluster_size<Dim>;
31+
32+
} // namespace cuda
33+
34+
template <>
35+
struct is_property_key<cuda::cluster_size_key<1>> : std::true_type {};
36+
template <>
37+
struct is_property_key<cuda::cluster_size_key<2>> : std::true_type {};
38+
template <>
39+
struct is_property_key<cuda::cluster_size_key<3>> : std::true_type {};
40+
41+
template <typename T>
42+
struct is_property_key_of<cuda::cluster_size_key<1>, T> : std::true_type {};
43+
44+
template <typename T>
45+
struct is_property_key_of<cuda::cluster_size_key<2>, T> : std::true_type {};
46+
47+
template <typename T>
48+
struct is_property_key_of<cuda::cluster_size_key<3>, T> : std::true_type {};
49+
50+
template <>
51+
struct is_property_value<cuda::cluster_size_key<1>>
52+
: is_property_key<cuda::cluster_size_key<1>> {};
53+
template <>
54+
struct is_property_value<cuda::cluster_size_key<2>>
55+
: is_property_key<cuda::cluster_size_key<2>> {};
56+
template <>
57+
struct is_property_value<cuda::cluster_size_key<3>>
58+
: is_property_key<cuda::cluster_size_key<3>> {};
59+
60+
template <typename O>
61+
struct is_property_value_of<cuda::cluster_size_key<1>, O>
62+
: is_property_key_of<cuda::cluster_size_key<1>, O> {};
63+
64+
template <typename O>
65+
struct is_property_value_of<cuda::cluster_size_key<2>, O>
66+
: is_property_key_of<cuda::cluster_size_key<2>, O> {};
67+
68+
template <typename O>
69+
struct is_property_value_of<cuda::cluster_size_key<3>, O>
70+
: is_property_key_of<cuda::cluster_size_key<3>, O> {};
71+
72+
namespace detail {
73+
template <typename PropertiesT> constexpr std::size_t getClusterDim() {
74+
if constexpr (PropertiesT::template has_property<
75+
sycl::ext::oneapi::experimental::cuda::cluster_size_key<
76+
1>>()) {
77+
return 1;
78+
}
79+
if constexpr (PropertiesT::template has_property<
80+
sycl::ext::oneapi::experimental::cuda::cluster_size_key<
81+
2>>()) {
82+
return 2;
83+
}
84+
if constexpr (PropertiesT::template has_property<
85+
sycl::ext::oneapi::experimental::cuda::cluster_size_key<
86+
3>>()) {
87+
return 3;
88+
}
89+
return 0;
90+
}
91+
} // namespace detail
92+
} // namespace ext::oneapi::experimental
93+
} // namespace _V1
94+
} // namespace sycl

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,9 @@ void parallel_for(handler &CGH,
149149
ext::oneapi::experimental::detail::LaunchConfigAccess<range<Dimensions>,
150150
Properties>
151151
ConfigAccess(Config);
152-
CGH.parallel_for<KernelName>(ConfigAccess.getRange(),
153-
std::forward<ReductionsT>(Reductions)...,
154-
KernelObj);
152+
CGH.parallel_for<KernelName>(
153+
ConfigAccess.getRange(), ConfigAccess.getProperties(),
154+
std::forward<ReductionsT>(Reductions)..., KernelObj);
155155
}
156156

157157
template <typename KernelName = sycl::detail::auto_name, int Dimensions,
@@ -225,9 +225,9 @@ void nd_launch(handler &CGH,
225225
ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
226226
Properties>
227227
ConfigAccess(Config);
228-
CGH.parallel_for<KernelName>(ConfigAccess.getRange(),
229-
std::forward<ReductionsT>(Reductions)...,
230-
KernelObj);
228+
CGH.parallel_for<KernelName>(
229+
ConfigAccess.getRange(), ConfigAccess.getProperties(),
230+
std::forward<ReductionsT>(Reductions)..., KernelObj);
231231
}
232232

233233
template <typename KernelName = sycl::detail::auto_name, int Dimensions,

sycl/include/sycl/ext/oneapi/experimental/graph.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ enum class UnsupportedGraphFeatures {
5353
sycl_ext_oneapi_enqueue_barrier = 4,
5454
sycl_ext_oneapi_memcpy2d = 5,
5555
sycl_ext_oneapi_device_global = 6,
56-
sycl_ext_oneapi_bindless_images = 7
56+
sycl_ext_oneapi_bindless_images = 7,
57+
sycl_ext_oneapi_experimental_cuda_cluster_launch = 8
5758
};
5859

5960
inline const char *
@@ -76,6 +77,8 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) {
7677
return "sycl_ext_oneapi_device_global";
7778
case UGF::sycl_ext_oneapi_bindless_images:
7879
return "sycl_ext_oneapi_bindless_images";
80+
case UGF::sycl_ext_oneapi_experimental_cuda_cluster_launch:
81+
return "sycl_ext_oneapi_experimental_cuda_cluster_launch";
7982
}
8083

8184
assert(false && "Unhandled graphs feature");

sycl/include/sycl/ext/oneapi/properties/property.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,9 @@ enum PropKind : uint32_t {
211211
OutputDataPlacement = 70,
212212
IncludeFiles = 71,
213213
RegisteredKernelNames = 72,
214+
ClusterLaunch = 73,
214215
// PropKindSize must always be the last value.
215-
PropKindSize = 73,
216+
PropKindSize = 74,
216217
};
217218

218219
struct property_key_base_tag {};

0 commit comments

Comments
 (0)