Skip to content

Commit 1c9b3de

Browse files
pgorlaniaelovikov-intel
authored andcommitted
[SYCL][CUDA] Test CUDA and SYCL in the same TU (intel#1377)
Test the compilation of CUDA and SYCL codes in the same translation unit. Co-authored-by: aelovikov-intel <[email protected]>
1 parent d246802 commit 1c9b3de

File tree

1 file changed

+207
-0
lines changed

1 file changed

+207
-0
lines changed
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %cuda_options -lcudart -lcuda -x cuda %s -o %t.out
2+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
3+
// REQUIRES: cuda && cuda_dev_kit
4+
5+
#include <cuda.h>
6+
#include <sycl/sycl.hpp>
7+
8+
// ------------------------------------------------------------------------- //
9+
10+
// device-sum: test_cuda_function_0 = X
11+
// host-sum: test_cuda_function_0 = -X
12+
template <typename T> __device__ T test_cuda_function_0(T a, T b) {
13+
return a + b;
14+
}
15+
template <typename T> __host__ T test_cuda_function_0(T a, T b) {
16+
return -a - b;
17+
}
18+
19+
// device-sum: test_cuda_function_2 + test_cuda_function_3 = 0
20+
// host-sum: test_cuda_function_3 = 0
21+
__device__ inline float test_cuda_function_2(float a, float b) {
22+
return -sin(a) + b;
23+
}
24+
__device__ inline float test_cuda_function_3(float a, float b) {
25+
return sin(a) - b;
26+
}
27+
__host__ inline float test_cuda_function_3(float a, float b) { return 0; }
28+
29+
// device/host-sum: test_cuda_function_4 = 0
30+
__device__ __host__ inline float test_cuda_function_4(float a, float b) {
31+
return (a - b) + (b - a);
32+
}
33+
34+
// device-sum: test_cuda_function_5 + test_cuda_function_6 = 0
35+
// host-sum: test_cuda_function_5 + test_cuda_function_6 = 0
36+
__host__ inline float test_cuda_function_5(float a, float b) { return 1.0f; }
37+
__device__ inline float test_cuda_function_5(float a, float b) {
38+
return -a + cos(b);
39+
}
40+
__host__ float test_cuda_function_6(float a, float b) { return -1.0f; }
41+
__device__ float test_cuda_function_6(float a, float b) { return a - cos(b); }
42+
43+
// Test the correct emission of __host__/__device__ functions for the host and
44+
// sycl-device compilation by verifying that b_host = -b_dev.
45+
void test_cuda_function_selection(sycl::queue &q) {
46+
47+
const int n0 = 512;
48+
const sycl::range<1> r0{n0};
49+
50+
sycl::buffer<float, 1> b_a{n0}, b_b{n0}, b_host{n0}, b_dev{n0};
51+
52+
{
53+
sycl::host_accessor a{b_a, sycl::write_only};
54+
sycl::host_accessor b{b_b, sycl::write_only};
55+
sycl::host_accessor c{b_host, sycl::write_only};
56+
57+
for (size_t i = 0; i < n0; i++) {
58+
a[i] = sin(i) * sin(i);
59+
b[i] = cos(i) * cos(i);
60+
c[i] = test_cuda_function_0(a[i], b[i]) + //<-- __host__
61+
test_cuda_function_3(a[i], b[i]) + //<-- __host__
62+
test_cuda_function_4(a[i], b[i]) + //<-- __host__ __device__
63+
(test_cuda_function_5(a[i], b[i]) + //<-- __host__
64+
test_cuda_function_6(a[i], b[i])); //<-- __host__
65+
}
66+
}
67+
68+
q.submit([&](sycl::handler &h) {
69+
sycl::accessor a{b_a, h, sycl::read_only};
70+
sycl::accessor b{b_b, h, sycl::read_only};
71+
sycl::accessor c{b_dev, h, sycl::write_only};
72+
73+
h.parallel_for(r0, [=](sycl::id<1> i) {
74+
c[i] = test_cuda_function_0(a[i], b[i]) + //<-- __device__
75+
(test_cuda_function_2(a[i], b[i]) + //<-- __device__
76+
test_cuda_function_3(a[i], b[i])) + //<-- __device__
77+
test_cuda_function_4(a[i], b[i]) + //<-- __host__ __device__
78+
(test_cuda_function_5(a[i], b[i]) + //<-- __device__
79+
test_cuda_function_6(a[i], b[i])); //<-- __device__
80+
});
81+
});
82+
83+
{
84+
sycl::host_accessor c1{b_host, sycl::read_only};
85+
sycl::host_accessor c2{b_dev, sycl::read_only};
86+
for (size_t i = 0; i < n0; i++) {
87+
// b_host = -1 b_dev
88+
assert((c1[i] + c2[i] < 1e-5) && "Results mismatch!");
89+
}
90+
}
91+
}
92+
93+
// ------------------------------------------------------------------------- //
94+
95+
__device__ int test_cuda_function_1() {
96+
return blockIdx.x * blockDim.x + threadIdx.x;
97+
}
98+
99+
__global__ void test_cuda_kernel(int *out) {
100+
int i = blockIdx.x * blockDim.x + threadIdx.x;
101+
out[i] = i - test_cuda_function_1();
102+
}
103+
104+
// Test CUDA kernel launch and CUDA API.
105+
void test_cuda_kernel_launch() {
106+
// CUDA
107+
const int n = 512;
108+
std::vector<int> result(n, -1);
109+
int *cuda_kern_result = NULL;
110+
111+
int block_size = 128;
112+
dim3 dimBlock(block_size, 1, 1);
113+
dim3 dimGrid(n / block_size, 1, 1);
114+
115+
cudaMalloc((void **)&cuda_kern_result, n * sizeof(int));
116+
117+
test_cuda_kernel<<<n / block_size, block_size>>>(cuda_kern_result);
118+
119+
cudaError_t error = cudaGetLastError();
120+
if (error != cudaSuccess)
121+
std::cerr << "CUDA ERROR: " << error << " " << cudaGetErrorString(error)
122+
<< std::endl;
123+
124+
cudaMemcpy(result.data(), cuda_kern_result, n * sizeof(int),
125+
cudaMemcpyDeviceToHost);
126+
127+
for (size_t i = 0; i < n; i++)
128+
assert((0 == result[i]) && "Kernel execution fail!");
129+
130+
cudaFree(cuda_kern_result);
131+
}
132+
133+
// ------------------------------------------------------------------------- //
134+
135+
__host__ float test_cuda_function_7() { return -1; }
136+
__device__ float test_cuda_function_7() { return 1; }
137+
138+
__host__ float test_cuda_function_8() { return 3; }
139+
140+
__device__ float test_cuda_function_9() { return 9; }
141+
142+
int test_regular_function_0() { return test_cuda_function_7(); }
143+
144+
int test_regular_function_1() { return test_cuda_function_8(); }
145+
146+
int test_regular_function_2() { return test_cuda_function_9(); }
147+
148+
// Test the correct emission of __device__/__host__ function when called by
149+
// regular functions.
150+
void test_regular_functions(sycl::queue &q) {
151+
152+
// regular func must returning the __host__ one (so, 1.0f)
153+
assert((test_regular_function_0() == -1) &&
154+
"Mismatch regular func to __host__");
155+
assert((test_regular_function_1() == 3) &&
156+
"Mismatch regular func to __host__");
157+
158+
sycl::buffer<int, 1> b_r{3};
159+
q.submit([&](sycl::handler &h) {
160+
sycl::accessor r{b_r, h, sycl::write_only};
161+
162+
h.single_task([=]() {
163+
r[0] = test_regular_function_0(); //<-- points to __device__
164+
r[1] = test_regular_function_1(); //<-- points to __host__
165+
r[2] = test_regular_function_2(); //<-- points to __device__
166+
});
167+
});
168+
169+
sycl::host_accessor r{b_r, sycl::read_only};
170+
assert((r[0] == 1) && "Mismatch regular func to __device__");
171+
assert((r[1] == 3) && "Mismatch regular func to __host__");
172+
assert((r[2] == 9) && "Mismatch regular func to __device__");
173+
}
174+
175+
// ------------------------------------------------------------------------- //
176+
177+
// Tests the result of a function that calls CUDA device builtins.
178+
void test_ids(sycl::queue &q) {
179+
180+
const size_t n1 = 2048;
181+
const sycl::range<1> r1{n1};
182+
sycl::buffer<int, 1> b_idx{n1};
183+
q.submit([&](sycl::handler &h) {
184+
sycl::accessor d_idx{b_idx, h, sycl::write_only};
185+
186+
h.parallel_for(r1,
187+
[=](sycl::id<1> i) { d_idx[i] = test_cuda_function_1(); });
188+
});
189+
190+
sycl::host_accessor h_idx{b_idx, sycl::read_only};
191+
for (size_t i = 0; i < n1; i++)
192+
assert((i == h_idx[i]) && "CUDA index mismatch!");
193+
}
194+
195+
// ------------------------------------------------------------------------- //
196+
197+
int main(int argc, char **argv) {
198+
199+
sycl::queue q{sycl::gpu_selector_v};
200+
201+
test_cuda_function_selection(q);
202+
test_cuda_kernel_launch();
203+
test_regular_functions(q);
204+
test_ids(q);
205+
206+
return 0;
207+
}

0 commit comments

Comments
 (0)