Skip to content

Commit ac0e692

Browse files
authored
[SYCL][NFC] Increase test coverage for inline asm feature (#2124)
This PR adds inline_asm feature tests with branching and loop. Signed-off-by: Aleksander Fadeev [email protected]
1 parent b6d7792 commit ac0e692

File tree

4 files changed

+209
-1
lines changed

4 files changed

+209
-1
lines changed

sycl/test/inline-asm/asm_if.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %GPU_RUN_PLACEHOLDER %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
11+
using DataType = cl::sycl::cl_int;
12+
13+
template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
14+
KernelFunctor(size_t ProblemSize) : WithOutputBuffer<T>(ProblemSize) {}
15+
16+
void operator()(cl::sycl::handler &CGH) {
17+
auto C = this->getOutputBuffer()
18+
.template get_access<cl::sycl::access::mode::write>(CGH);
19+
bool switchField = false;
20+
// clang-format off
21+
CGH.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()},
23+
[=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
24+
// clang-format on
25+
int Output = 0;
26+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
27+
asm volatile(".decl P1 v_type=P num_elts=1\n"
28+
"cmp.eq (M1_NM, 8) P1 %1(0,0)<0;1,0> 0x0:b\n"
29+
"(P1) sel (M1_NM, 8) %0(0,0)<1> 0x7:d 0x8:d"
30+
: "=rw"(Output)
31+
: "rw"(switchField));
32+
33+
#else
34+
if (switchField == false)
35+
Output = 7;
36+
else
37+
Output = 8;
38+
#endif
39+
C[wiID] = Output;
40+
});
41+
}
42+
};
43+
44+
int main() {
45+
KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE);
46+
if (!launchInlineASMTest(Functor))
47+
return 0;
48+
49+
if (verify_all_the_same(Functor.getOutputBufferData(), 7))
50+
return 0;
51+
52+
return 1;
53+
}

sycl/test/inline-asm/asm_loop.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %GPU_RUN_PLACEHOLDER %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
#include <cmath>
11+
#include <iostream>
12+
#include <vector>
13+
14+
using DataType = cl::sycl::cl_int;
15+
16+
template <typename T = DataType>
17+
struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
18+
KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2)
19+
: WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(
20+
input1.size()) {}
21+
22+
void operator()(cl::sycl::handler &CGH) {
23+
auto A = this->getInputBuffer(0)
24+
.template get_access<cl::sycl::access::mode::read>(CGH);
25+
auto B = this->getInputBuffer(1)
26+
.template get_access<cl::sycl::access::mode::read>(CGH);
27+
auto C = this->getOutputBuffer()
28+
.template get_access<cl::sycl::access::mode::write>(CGH);
29+
// clang-format off
30+
CGH.parallel_for<KernelFunctor<T>>(
31+
cl::sycl::range<1>{this->getOutputBufferSize()},
32+
[=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
33+
// clang-format on
34+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
35+
asm volatile(".decl P1 v_type=P num_elts=8\n"
36+
".decl P2 v_type=P num_elts=8\n"
37+
".decl temp v_type=G type=d num_elts=8 align=dword\n"
38+
"mov (M1, 8) %0(0, 0)<1> 0x0:d\n"
39+
"cmp.le (M1, 8) P1 %1(0,0)<1;1,0> 0x0:d\n"
40+
"(P1) goto (M1, 8) label0\n"
41+
"mov (M1, 8) temp(0,0)<1> 0x0:d\n"
42+
"label1:\n"
43+
"add (M1, 8) temp(0,0)<1> temp(0,0)<1;1,0> 0x1:w\n"
44+
"add (M1, 8) %0(0,0)<1> %0(0,0)<1;1,0> %2(0,0)<1;1,0>\n"
45+
"cmp.lt (M1, 8) P2 temp(0,0)<0;8,1> %1(0,0)<0;8,1>\n"
46+
"(P2) goto (M1, 8) label1\n"
47+
"label0:"
48+
: "+rw"(C[wiID])
49+
: "rw"(A[wiID]), "rw"(B[wiID]));
50+
#else
51+
C[wiID] = 0;
52+
for (int i = 0; i < A[wiID]; ++i) {
53+
C[wiID] = C[wiID] + B[wiID];
54+
}
55+
#endif
56+
});
57+
}
58+
};
59+
60+
int main() {
61+
std::vector<DataType> InputA(DEFAULT_PROBLEM_SIZE),
62+
InputB(DEFAULT_PROBLEM_SIZE);
63+
for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
64+
InputA[i] = i;
65+
InputB[i] = 2 * i;
66+
}
67+
68+
KernelFunctor<> Functor(InputA, InputB);
69+
if (!launchInlineASMTest(Functor))
70+
return 0;
71+
72+
auto &C = Functor.getOutputBufferData();
73+
for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
74+
if (C[i] != InputA[i] * InputB[i]) {
75+
std::cerr << "At index: " << i << ". ";
76+
std::cerr << C[i] << " != " << InputA[i] * InputB[i] << "\n";
77+
return 1;
78+
}
79+
}
80+
81+
return 0;
82+
}

sycl/test/inline-asm/asm_switch.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// UNSUPPORTED: cuda
2+
// REQUIRES: gpu,linux
3+
// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %clangxx -fsycl %s -o %t.ref.out
6+
// RUN: %GPU_RUN_PLACEHOLDER %t.ref.out
7+
8+
#include "include/asmhelper.h"
9+
#include <CL/sycl.hpp>
10+
11+
using DataType = cl::sycl::cl_int;
12+
13+
template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
14+
KernelFunctor(size_t ProblemSize) : WithOutputBuffer<T>(ProblemSize) {}
15+
16+
void operator()(cl::sycl::handler &CGH) {
17+
auto C = this->getOutputBuffer()
18+
.template get_access<cl::sycl::access::mode::write>(CGH);
19+
int switchField = 2;
20+
// clang-format off
21+
CGH.parallel_for<KernelFunctor<T>>(
22+
cl::sycl::range<1>{this->getOutputBufferSize()},
23+
[=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
24+
// clang-format on
25+
int Output = 0;
26+
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
27+
asm volatile(".decl P1 v_type=P num_elts=1\n"
28+
".decl P2 v_type=P num_elts=1\n"
29+
".decl P3 v_type=P num_elts=1\n"
30+
"cmp.ne (M1_NM, 8) P1 %1(0,0)<0;1,0> 0x0:d\n"
31+
"(P1) goto (M1, 1) label0\n"
32+
"mov (M1, 8) %0(0,0)<1> 0x9:d\n"
33+
"(P1) goto (M1, 1) label0\n"
34+
"label0:\n"
35+
"cmp.ne (M1_NM, 8) P2 %1(0,0)<0;1,0> 0x1:d\n"
36+
"(P2) goto (M1, 1) label1\n"
37+
"mov (M1, 8) %0(0,0)<1> 0x8:d\n"
38+
"label1:\n"
39+
"cmp.ne (M1_NM, 8) P3 %1(0,0)<0;1,0> 0x2:d\n"
40+
"(P3) goto (M1, 1) label2\n"
41+
"mov (M1, 8) %0(0,0)<1> 0x7:d\n"
42+
"label2:"
43+
: "=rw"(Output)
44+
: "rw"(switchField));
45+
46+
#else
47+
switch (switchField) {
48+
case 0:
49+
Output = 9;
50+
break;
51+
case 1:
52+
Output = 8;
53+
break;
54+
case 2:
55+
Output = 7;
56+
break;
57+
}
58+
#endif
59+
C[wiID] = Output;
60+
});
61+
}
62+
};
63+
64+
int main() {
65+
KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE);
66+
if (!launchInlineASMTest(Functor))
67+
return 0;
68+
69+
if (verify_all_the_same(Functor.getOutputBufferData(), 7))
70+
return 0;
71+
72+
return 1;
73+
}

sycl/test/inline-asm/include/asmhelper.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
105105
return false;
106106
}
107107

108-
deviceQueue.submit(f).wait();
108+
deviceQueue.submit(f).wait_and_throw();
109109
} catch (cl::sycl::exception &e) {
110110
std::cerr << "Caught exception: " << e.what() << std::endl;
111111
}

0 commit comments

Comments
 (0)