|
| 1 | +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %cuda_options -lcudart -lcuda -x cuda %s -o %t.out |
| 2 | +// RUN: %GPU_RUN_PLACEHOLDER %t.out |
| 3 | +// REQUIRES: cuda && cuda_dev_kit |
| 4 | + |
| 5 | +#include <cuda.h> |
| 6 | +#include <sycl/sycl.hpp> |
| 7 | + |
| 8 | +// ------------------------------------------------------------------------- // |
| 9 | + |
| 10 | +// device-sum: test_cuda_function_0 = X |
| 11 | +// host-sum: test_cuda_function_0 = -X |
| 12 | +template <typename T> __device__ T test_cuda_function_0(T a, T b) { |
| 13 | + return a + b; |
| 14 | +} |
| 15 | +template <typename T> __host__ T test_cuda_function_0(T a, T b) { |
| 16 | + return -a - b; |
| 17 | +} |
| 18 | + |
| 19 | +// device-sum: test_cuda_function_2 + test_cuda_function_3 = 0 |
| 20 | +// host-sum: test_cuda_function_3 = 0 |
| 21 | +__device__ inline float test_cuda_function_2(float a, float b) { |
| 22 | + return -sin(a) + b; |
| 23 | +} |
| 24 | +__device__ inline float test_cuda_function_3(float a, float b) { |
| 25 | + return sin(a) - b; |
| 26 | +} |
| 27 | +__host__ inline float test_cuda_function_3(float a, float b) { return 0; } |
| 28 | + |
| 29 | +// device/host-sum: test_cuda_function_4 = 0 |
| 30 | +__device__ __host__ inline float test_cuda_function_4(float a, float b) { |
| 31 | + return (a - b) + (b - a); |
| 32 | +} |
| 33 | + |
| 34 | +// device-sum: test_cuda_function_5 + test_cuda_function_6 = 0 |
| 35 | +// host-sum: test_cuda_function_5 + test_cuda_function_6 = 0 |
| 36 | +__host__ inline float test_cuda_function_5(float a, float b) { return 1.0f; } |
| 37 | +__device__ inline float test_cuda_function_5(float a, float b) { |
| 38 | + return -a + cos(b); |
| 39 | +} |
| 40 | +__host__ float test_cuda_function_6(float a, float b) { return -1.0f; } |
| 41 | +__device__ float test_cuda_function_6(float a, float b) { return a - cos(b); } |
| 42 | + |
| 43 | +// Test the correct emission of __host__/__device__ functions for the host and |
| 44 | +// sycl-device compilation by verifying that b_host = -b_dev. |
| 45 | +void test_cuda_function_selection(sycl::queue &q) { |
| 46 | + |
| 47 | + const int n0 = 512; |
| 48 | + const sycl::range<1> r0{n0}; |
| 49 | + |
| 50 | + sycl::buffer<float, 1> b_a{n0}, b_b{n0}, b_host{n0}, b_dev{n0}; |
| 51 | + |
| 52 | + { |
| 53 | + sycl::host_accessor a{b_a, sycl::write_only}; |
| 54 | + sycl::host_accessor b{b_b, sycl::write_only}; |
| 55 | + sycl::host_accessor c{b_host, sycl::write_only}; |
| 56 | + |
| 57 | + for (size_t i = 0; i < n0; i++) { |
| 58 | + a[i] = sin(i) * sin(i); |
| 59 | + b[i] = cos(i) * cos(i); |
| 60 | + c[i] = test_cuda_function_0(a[i], b[i]) + //<-- __host__ |
| 61 | + test_cuda_function_3(a[i], b[i]) + //<-- __host__ |
| 62 | + test_cuda_function_4(a[i], b[i]) + //<-- __host__ __device__ |
| 63 | + (test_cuda_function_5(a[i], b[i]) + //<-- __host__ |
| 64 | + test_cuda_function_6(a[i], b[i])); //<-- __host__ |
| 65 | + } |
| 66 | + } |
| 67 | + |
| 68 | + q.submit([&](sycl::handler &h) { |
| 69 | + sycl::accessor a{b_a, h, sycl::read_only}; |
| 70 | + sycl::accessor b{b_b, h, sycl::read_only}; |
| 71 | + sycl::accessor c{b_dev, h, sycl::write_only}; |
| 72 | + |
| 73 | + h.parallel_for(r0, [=](sycl::id<1> i) { |
| 74 | + c[i] = test_cuda_function_0(a[i], b[i]) + //<-- __device__ |
| 75 | + (test_cuda_function_2(a[i], b[i]) + //<-- __device__ |
| 76 | + test_cuda_function_3(a[i], b[i])) + //<-- __device__ |
| 77 | + test_cuda_function_4(a[i], b[i]) + //<-- __host__ __device__ |
| 78 | + (test_cuda_function_5(a[i], b[i]) + //<-- __device__ |
| 79 | + test_cuda_function_6(a[i], b[i])); //<-- __device__ |
| 80 | + }); |
| 81 | + }); |
| 82 | + |
| 83 | + { |
| 84 | + sycl::host_accessor c1{b_host, sycl::read_only}; |
| 85 | + sycl::host_accessor c2{b_dev, sycl::read_only}; |
| 86 | + for (size_t i = 0; i < n0; i++) { |
| 87 | + // b_host = -1 b_dev |
| 88 | + assert((c1[i] + c2[i] < 1e-5) && "Results mismatch!"); |
| 89 | + } |
| 90 | + } |
| 91 | +} |
| 92 | + |
| 93 | +// ------------------------------------------------------------------------- // |
| 94 | + |
| 95 | +__device__ int test_cuda_function_1() { |
| 96 | + return blockIdx.x * blockDim.x + threadIdx.x; |
| 97 | +} |
| 98 | + |
| 99 | +__global__ void test_cuda_kernel(int *out) { |
| 100 | + int i = blockIdx.x * blockDim.x + threadIdx.x; |
| 101 | + out[i] = i - test_cuda_function_1(); |
| 102 | +} |
| 103 | + |
| 104 | +// Test CUDA kernel launch and CUDA API. |
| 105 | +void test_cuda_kernel_launch() { |
| 106 | + // CUDA |
| 107 | + const int n = 512; |
| 108 | + std::vector<int> result(n, -1); |
| 109 | + int *cuda_kern_result = NULL; |
| 110 | + |
| 111 | + int block_size = 128; |
| 112 | + dim3 dimBlock(block_size, 1, 1); |
| 113 | + dim3 dimGrid(n / block_size, 1, 1); |
| 114 | + |
| 115 | + cudaMalloc((void **)&cuda_kern_result, n * sizeof(int)); |
| 116 | + |
| 117 | + test_cuda_kernel<<<n / block_size, block_size>>>(cuda_kern_result); |
| 118 | + |
| 119 | + cudaError_t error = cudaGetLastError(); |
| 120 | + if (error != cudaSuccess) |
| 121 | + std::cerr << "CUDA ERROR: " << error << " " << cudaGetErrorString(error) |
| 122 | + << std::endl; |
| 123 | + |
| 124 | + cudaMemcpy(result.data(), cuda_kern_result, n * sizeof(int), |
| 125 | + cudaMemcpyDeviceToHost); |
| 126 | + |
| 127 | + for (size_t i = 0; i < n; i++) |
| 128 | + assert((0 == result[i]) && "Kernel execution fail!"); |
| 129 | + |
| 130 | + cudaFree(cuda_kern_result); |
| 131 | +} |
| 132 | + |
| 133 | +// ------------------------------------------------------------------------- // |
| 134 | + |
| 135 | +__host__ float test_cuda_function_7() { return -1; } |
| 136 | +__device__ float test_cuda_function_7() { return 1; } |
| 137 | + |
| 138 | +__host__ float test_cuda_function_8() { return 3; } |
| 139 | + |
| 140 | +__device__ float test_cuda_function_9() { return 9; } |
| 141 | + |
| 142 | +int test_regular_function_0() { return test_cuda_function_7(); } |
| 143 | + |
| 144 | +int test_regular_function_1() { return test_cuda_function_8(); } |
| 145 | + |
| 146 | +int test_regular_function_2() { return test_cuda_function_9(); } |
| 147 | + |
| 148 | +// Test the correct emission of __device__/__host__ function when called by |
| 149 | +// regular functions. |
| 150 | +void test_regular_functions(sycl::queue &q) { |
| 151 | + |
| 152 | + // regular func must returning the __host__ one (so, 1.0f) |
| 153 | + assert((test_regular_function_0() == -1) && |
| 154 | + "Mismatch regular func to __host__"); |
| 155 | + assert((test_regular_function_1() == 3) && |
| 156 | + "Mismatch regular func to __host__"); |
| 157 | + |
| 158 | + sycl::buffer<int, 1> b_r{3}; |
| 159 | + q.submit([&](sycl::handler &h) { |
| 160 | + sycl::accessor r{b_r, h, sycl::write_only}; |
| 161 | + |
| 162 | + h.single_task([=]() { |
| 163 | + r[0] = test_regular_function_0(); //<-- points to __device__ |
| 164 | + r[1] = test_regular_function_1(); //<-- points to __host__ |
| 165 | + r[2] = test_regular_function_2(); //<-- points to __device__ |
| 166 | + }); |
| 167 | + }); |
| 168 | + |
| 169 | + sycl::host_accessor r{b_r, sycl::read_only}; |
| 170 | + assert((r[0] == 1) && "Mismatch regular func to __device__"); |
| 171 | + assert((r[1] == 3) && "Mismatch regular func to __host__"); |
| 172 | + assert((r[2] == 9) && "Mismatch regular func to __device__"); |
| 173 | +} |
| 174 | + |
| 175 | +// ------------------------------------------------------------------------- // |
| 176 | + |
| 177 | +// Tests the result of a function that calls CUDA device builtins. |
| 178 | +void test_ids(sycl::queue &q) { |
| 179 | + |
| 180 | + const size_t n1 = 2048; |
| 181 | + const sycl::range<1> r1{n1}; |
| 182 | + sycl::buffer<int, 1> b_idx{n1}; |
| 183 | + q.submit([&](sycl::handler &h) { |
| 184 | + sycl::accessor d_idx{b_idx, h, sycl::write_only}; |
| 185 | + |
| 186 | + h.parallel_for(r1, |
| 187 | + [=](sycl::id<1> i) { d_idx[i] = test_cuda_function_1(); }); |
| 188 | + }); |
| 189 | + |
| 190 | + sycl::host_accessor h_idx{b_idx, sycl::read_only}; |
| 191 | + for (size_t i = 0; i < n1; i++) |
| 192 | + assert((i == h_idx[i]) && "CUDA index mismatch!"); |
| 193 | +} |
| 194 | + |
| 195 | +// ------------------------------------------------------------------------- // |
| 196 | + |
| 197 | +int main(int argc, char **argv) { |
| 198 | + |
| 199 | + sycl::queue q{sycl::gpu_selector_v}; |
| 200 | + |
| 201 | + test_cuda_function_selection(q); |
| 202 | + test_cuda_kernel_launch(); |
| 203 | + test_regular_functions(q); |
| 204 | + test_ids(q); |
| 205 | + |
| 206 | + return 0; |
| 207 | +} |
0 commit comments