Skip to content

Commit a346c08

Browse files
authored
[SYCL][CUDA] Update program manager and queue to resolve multi-targeting issues (#4921)
This PR makes two changes, the first is it moves the macro which prevents `__devicelib_assert_read` being used for `nvptx64` devices. This is done to resolve an issue where the binary images of `spirv64` and `nvptx64` are neither identical nor disjoint (have no kernels in common). The program manager needs binary images to be identical or disjoint, it fails otherwise. This creates a kernel of the same name as when building for `spirv64` but it does not use `__devicelib_assert_read`. The second it prevents errors being thrown in the program manager when the binaries compatibility check returns false. This is to allow for multi-targeting to be used with module splitting. A cuda and hip only regression test is added to check for successful compilation with multi-targeting and module splitting options.
1 parent 59fcb82 commit a346c08

File tree

3 files changed

+47
-4
lines changed

3 files changed

+47
-4
lines changed

sycl/include/CL/sycl/queue.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767

6868
// Helper macro to identify if fallback assert is needed
6969
// FIXME remove __NVPTX__ condition once devicelib supports CUDA
70-
#if !defined(SYCL_DISABLE_FALLBACK_ASSERT) && !defined(__NVPTX__)
70+
#if !defined(SYCL_DISABLE_FALLBACK_ASSERT)
7171
#define __SYCL_USE_FALLBACK_ASSERT 1
7272
#else
7373
#define __SYCL_USE_FALLBACK_ASSERT 0
@@ -1187,11 +1187,11 @@ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue,
11871187
auto Acc = Buffer.get_access<access::mode::write>(CGH);
11881188

11891189
CGH.single_task<__sycl_service_kernel__::AssertInfoCopier>([Acc] {
1190-
#ifdef __SYCL_DEVICE_ONLY__
1190+
#if defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
11911191
__devicelib_assert_read(&Acc[0]);
11921192
#else
11931193
(void)Acc;
1194-
#endif // __SYCL_DEVICE_ONLY__
1194+
#endif // defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
11951195
});
11961196
};
11971197
auto CheckerCGF = [&CopierEv, &Buffer](handler &CGH) {

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1313,9 +1313,12 @@ static bool compatibleWithDevice(RTDeviceBinaryImage *BinImage,
13131313
pi_uint32 SuitableImageID = std::numeric_limits<pi_uint32>::max();
13141314
pi_device_binary DevBin =
13151315
const_cast<pi_device_binary>(&BinImage->getRawData());
1316-
Plugin.call<PiApiKind::piextDeviceSelectBinary>(
1316+
RT::PiResult Error = Plugin.call_nocheck<PiApiKind::piextDeviceSelectBinary>(
13171317
PIDeviceHandle, &DevBin,
13181318
/*num bin images = */ (cl_uint)1, &SuitableImageID);
1319+
if (Error != PI_SUCCESS && Error != PI_INVALID_BINARY)
1320+
throw runtime_error("Invalid binary image or device", PI_INVALID_VALUE);
1321+
13191322
return (0 == SuitableImageID);
13201323
}
13211324

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// REQUIRES: cuda || hip_be
2+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple,spir64 %s -o -
3+
// RUN: %clangxx -fsycl -fsycl-targets=spir64,%sycl_triple %s -o -
4+
//
5+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple,spir64 %s -o -
6+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64,%sycl_triple %s -o -
7+
//
8+
// Test checks that compiling for multiple devices works regardless of target
9+
// order.
10+
11+
#include <sycl/sycl.hpp>
12+
13+
using namespace cl::sycl;
14+
15+
int main() {
16+
sycl::queue q;
17+
18+
float A_Data[5] = {1.1};
19+
float B_Data[5] = {0};
20+
int C_Data[10] = {0};
21+
22+
{
23+
buffer<float, 1> A_buff(A_Data, range<1>(5));
24+
buffer<float, 1> B_buff(B_Data, range<1>(5));
25+
q.submit([&](handler &cgh) {
26+
auto A_acc = A_buff.get_access<access::mode::read>(cgh);
27+
auto B_acc = B_buff.get_access<access::mode::write>(cgh);
28+
cgh.parallel_for(range<1>{5},
29+
[=](id<1> index) { B_acc[index] = A_acc[index]; });
30+
}).wait();
31+
}
32+
33+
{
34+
buffer<int, 1> C_buff(C_Data, range<1>(10));
35+
q.submit([&](handler &cgh) {
36+
auto C_acc = C_buff.get_access<access::mode::write>(cgh);
37+
cgh.parallel_for(range<1>{10}, [=](id<1> index) { C_acc[index] = 15; });
38+
}).wait();
39+
}
40+
}

0 commit comments

Comments
 (0)