Skip to content

Commit 148d04d

Browse files
authored
[SYCL] Add tests for inline asm feature (#1444)
Signed-off-by: amochalo <[email protected]> Signed-off-by: Alexey Sachkov <[email protected]>
1 parent 4c07ff8 commit 148d04d

24 files changed

+1291
-1
lines changed

sycl/test/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ add_lit_testsuites(SYCL-DEPLOY ${CMAKE_CURRENT_SOURCE_DIR}
6161
EXCLUDE_FROM_CHECK_ALL
6262
)
6363

64+
add_lit_target(check-sycl-inline-asm
65+
"Running lit suite ${CMAKE_CURRENT_SOURCE_DIR}/feature-tests/inline-asm"
66+
"feature-tests/inline-asm"
67+
ARGS ${RT_TEST_ARGS}
68+
PARAMS "SYCL_BE=PI_OPENCL"
69+
DEPENDS ${SYCL_TEST_DEPS}
70+
)
71+
6472
if(SYCL_BUILD_PI_CUDA)
6573
add_lit_testsuite(check-sycl-cuda "Running the SYCL regression tests for CUDA"
6674
${CMAKE_CURRENT_BINARY_DIR}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <iostream>
11+
#include <vector>
12+
13+
using dataType = cl::sycl::cl_int;
14+
15+
template <typename T = dataType>
16+
struct KernelFunctor : WithOutputBuffer<T> {
17+
KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
18+
19+
void operator()(cl::sycl::handler &cgh) {
20+
auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
21+
cgh.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
23+
C[wiID] = 43;
24+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
25+
asm volatile("");
26+
#endif
27+
});
28+
}
29+
};
30+
31+
int main() {
32+
KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
33+
if (!launchInlineASMTest(f))
34+
return 0;
35+
36+
if (verify_all_the_same(f.getOutputBufferData(), 43))
37+
return 0;
38+
39+
return 1;
40+
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <iostream>
11+
#include <vector>
12+
13+
using dataType = cl::sycl::cl_int;
14+
15+
template <typename T = dataType>
16+
struct KernelFunctor : WithOutputBuffer<T> {
17+
KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
18+
19+
void operator()(cl::sycl::handler &cgh) {
20+
auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
21+
cgh.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
23+
volatile int output = 0;
24+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
25+
asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
26+
: "=rw"(output));
27+
#else
28+
output = 7;
29+
#endif
30+
C[wiID] = output;
31+
});
32+
}
33+
};
34+
35+
int main() {
36+
KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
37+
if (!launchInlineASMTest(f))
38+
return 0;
39+
40+
if (verify_all_the_same(f.getOutputBufferData(), 7))
41+
return 0;
42+
43+
return 1;
44+
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <iostream>
11+
#include <vector>
12+
13+
using dataType = cl::sycl::cl_int;
14+
15+
template <typename T = dataType>
16+
struct KernelFunctor : WithOutputBuffer<T> {
17+
KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
18+
19+
void operator()(cl::sycl::handler &cgh) {
20+
auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
21+
cgh.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
23+
volatile int output = 0;
24+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
25+
asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
26+
: "=rw"(output));
27+
#else
28+
output = 7;
29+
#endif
30+
C[wiID] = output;
31+
});
32+
}
33+
};
34+
35+
int main() {
36+
KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
37+
if (!launchInlineASMTest(f))
38+
return 0;
39+
40+
if (verify_all_the_same(f.getOutputBufferData(), 7))
41+
return 0;
42+
43+
return 1;
44+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <iostream>
11+
#include <vector>
12+
13+
using dataType = cl::sycl::cl_int;
14+
15+
template <typename T = dataType>
16+
struct KernelFunctor : WithOutputBuffer<T> {
17+
KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
18+
19+
void operator()(cl::sycl::handler &cgh) {
20+
auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
21+
cgh.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
23+
for (int i = 0; i < 10; ++i) {
24+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
25+
asm("fence_sw");
26+
C[wiID] += i;
27+
28+
#else
29+
C[wiID] += i;
30+
#endif
31+
}
32+
});
33+
}
34+
};
35+
36+
int main() {
37+
KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
38+
if (!launchInlineASMTest(f))
39+
return 0;
40+
41+
if (verify_all_the_same(f.getOutputBufferData(), 45))
42+
return 0;
43+
44+
return 1;
45+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <iostream>
11+
#include <vector>
12+
13+
using dataType = cl::sycl::cl_int;
14+
15+
template <typename T = dataType>
16+
struct KernelFunctor : WithOutputBuffer<T> {
17+
KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
18+
19+
void operator()(cl::sycl::handler &cgh) {
20+
auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
21+
cgh.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
23+
C[wiID] = 43;
24+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
25+
asm volatile("");
26+
#endif
27+
});
28+
}
29+
};
30+
31+
int main() {
32+
KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
33+
if (!launchInlineASMTest(f))
34+
return 0;
35+
36+
if (verify_all_the_same(f.getOutputBufferData(), 43))
37+
return 0;
38+
39+
return 1;
40+
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <iostream>
11+
#include <vector>
12+
13+
using dataType = cl::sycl::cl_int;
14+
15+
template <typename T = dataType>
16+
struct KernelFunctor : WithOutputBuffer<T> {
17+
KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
18+
19+
void operator()(cl::sycl::handler &cgh) {
20+
auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
21+
cgh.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
23+
volatile int output = 0;
24+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
25+
asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
26+
: "=rw"(output));
27+
#else
28+
output = 7;
29+
#endif
30+
C[wiID] = output;
31+
});
32+
}
33+
};
34+
35+
int main() {
36+
KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
37+
if (!launchInlineASMTest(f))
38+
return 0;
39+
40+
if (verify_all_the_same(f.getOutputBufferData(), 7))
41+
return 0;
42+
43+
return 1;
44+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <iostream>
11+
#include <vector>
12+
13+
using dataType = cl::sycl::cl_int;
14+
15+
template <typename T = dataType>
16+
struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
17+
KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2, const std::vector<T> &input3) : WithInputBuffers<T, 3>(input1, input2, input3), WithOutputBuffer<T>(input1.size()) {}
18+
19+
void operator()(cl::sycl::handler &cgh) {
20+
auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
21+
auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
22+
auto C = this->getInputBuffer(2).template get_access<cl::sycl::access::mode::read>(cgh);
23+
auto D = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
24+
25+
cgh.parallel_for<KernelFunctor<T>>(
26+
cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
27+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
28+
asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
29+
: "=rw"(D[wiID])
30+
: "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID]));
31+
#else
32+
D[wiID] = A[wiID] * B[wiID] + C[wiID];
33+
#endif
34+
});
35+
}
36+
};
37+
38+
int main() {
39+
std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE);
40+
for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
41+
inputA[i] = i;
42+
inputB[i] = i;
43+
inputC[i] = DEFAULT_PROBLEM_SIZE - i * i;
44+
}
45+
46+
KernelFunctor<> f(inputA, inputB, inputC);
47+
if (!launchInlineASMTest(f))
48+
return 0;
49+
50+
auto &D = f.getOutputBufferData();
51+
for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
52+
if (D[i] != inputA[i] * inputB[i] + inputC[i]) {
53+
std::cerr << "At index: " << i << ". ";
54+
std::cerr << D[i] << " != " << inputA[i] * inputB[i] + inputC[i] << "\n";
55+
return 1;
56+
}
57+
}
58+
return 0;
59+
}

0 commit comments

Comments
 (0)