Skip to content

Commit fbacbd6

Browse files
Merge master into reuse_dpctl_pow
2 parents 281fc21 + 0fd57d4 commit fbacbd6

31 files changed

+1389
-412
lines changed

.github/workflows/conda-package.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ env:
1515
test_arraycreation.py
1616
test_dot.py
1717
test_dparray.py
18+
test_copy.py
1819
test_fft.py
1920
test_linalg.py
2021
test_logic.py
@@ -23,9 +24,11 @@ env:
2324
test_random_state.py
2425
test_sort.py
2526
test_special.py
27+
test_sycl_queue.py
2628
test_umath.py
2729
test_usm_type.py
2830
third_party/cupy/linalg_tests/test_product.py
31+
third_party/cupy/logic_tests/test_comparison.py
2932
third_party/cupy/logic_tests/test_truth.py
3033
third_party/cupy/manipulation_tests/test_basic.py
3134
third_party/cupy/manipulation_tests/test_join.py

doc/reference/math.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ Handling complex numbers
169169
dpnp.imag
170170
dpnp.conj
171171
dpnp.conjugate
172+
dpnp.proj
172173

173174

174175
Extrema Finding

doc/reference/ufunc.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Math operations
4242
dpnp.log10
4343
dpnp.expm1
4444
dpnp.log1p
45+
dpnp.proj
4546
dpnp.sqrt
4647
dpnp.square
4748
dpnp.reciprocal

dpnp/backend/kernels/dpnp_krnl_bitwise.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,14 @@ DPCTLSyclEventRef dpnp_invert_c(DPCTLSyclQueueRef q_ref,
6868
sg.get_group_id()[0] * max_sg_size);
6969

7070
if (start + static_cast<size_t>(vec_sz) * max_sg_size < size) {
71-
using multi_ptrT =
72-
sycl::multi_ptr<_DataType,
73-
sycl::access::address_space::global_space>;
71+
auto input_multi_ptr = sycl::address_space_cast<
72+
sycl::access::address_space::global_space,
73+
sycl::access::decorated::yes>(&input_data[start]);
74+
auto result_multi_ptr = sycl::address_space_cast<
75+
sycl::access::address_space::global_space,
76+
sycl::access::decorated::yes>(&result[start]);
7477

75-
sycl::vec<_DataType, vec_sz> x =
76-
sg.load<vec_sz>(multi_ptrT(&input_data[start]));
78+
sycl::vec<_DataType, vec_sz> x = sg.load<vec_sz>(input_multi_ptr);
7779
sycl::vec<_DataType, vec_sz> res_vec;
7880

7981
if constexpr (std::is_same_v<_DataType, bool>) {
@@ -86,7 +88,7 @@ DPCTLSyclEventRef dpnp_invert_c(DPCTLSyclQueueRef q_ref,
8688
res_vec = ~x;
8789
}
8890

89-
sg.store<vec_sz>(multi_ptrT(&result[start]), res_vec);
91+
sg.store<vec_sz>(result_multi_ptr, res_vec);
9092
}
9193
else {
9294
for (size_t k = start + sg.get_local_id()[0]; k < size;

dpnp/backend/kernels/dpnp_krnl_elemwise.cpp

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,8 +1326,6 @@ static void func_map_init_elemwise_1arg_1type(func_map_t &fmap)
13261326
{ \
13271327
constexpr size_t lws = 64; \
13281328
constexpr unsigned int vec_sz = 8; \
1329-
constexpr sycl::access::address_space global_space = \
1330-
sycl::access::address_space::global_space; \
13311329
\
13321330
auto gws_range = sycl::range<1>( \
13331331
((result_size + lws * vec_sz - 1) / (lws * vec_sz)) * \
@@ -1344,12 +1342,17 @@ static void func_map_init_elemwise_1arg_1type(func_map_t &fmap)
13441342
\
13451343
if (start + static_cast<size_t>(vec_sz) * max_sg_size < \
13461344
result_size) { \
1347-
using input1_ptrT = \
1348-
sycl::multi_ptr<_DataType_input1, global_space>; \
1349-
using input2_ptrT = \
1350-
sycl::multi_ptr<_DataType_input2, global_space>; \
1351-
using result_ptrT = \
1352-
sycl::multi_ptr<_DataType_output, global_space>; \
1345+
auto input1_multi_ptr = sycl::address_space_cast< \
1346+
sycl::access::address_space::global_space, \
1347+
sycl::access::decorated::yes>( \
1348+
&input1_data[start]); \
1349+
auto input2_multi_ptr = sycl::address_space_cast< \
1350+
sycl::access::address_space::global_space, \
1351+
sycl::access::decorated::yes>( \
1352+
&input2_data[start]); \
1353+
auto result_multi_ptr = sycl::address_space_cast< \
1354+
sycl::access::address_space::global_space, \
1355+
sycl::access::decorated::yes>(&result[start]); \
13531356
\
13541357
sycl::vec<_DataType_output, vec_sz> res_vec; \
13551358
\
@@ -1363,11 +1366,9 @@ static void func_map_init_elemwise_1arg_1type(func_map_t &fmap)
13631366
_DataType_output>) \
13641367
{ \
13651368
sycl::vec<_DataType_input1, vec_sz> x1 = \
1366-
sg.load<vec_sz>( \
1367-
input1_ptrT(&input1_data[start])); \
1369+
sg.load<vec_sz>(input1_multi_ptr); \
13681370
sycl::vec<_DataType_input2, vec_sz> x2 = \
1369-
sg.load<vec_sz>( \
1370-
input2_ptrT(&input2_data[start])); \
1371+
sg.load<vec_sz>(input2_multi_ptr); \
13711372
\
13721373
res_vec = __vec_operation__; \
13731374
} \
@@ -1377,33 +1378,28 @@ static void func_map_init_elemwise_1arg_1type(func_map_t &fmap)
13771378
sycl::vec<_DataType_output, vec_sz> x1 = \
13781379
dpnp_vec_cast<_DataType_output, \
13791380
_DataType_input1, vec_sz>( \
1380-
sg.load<vec_sz>(input1_ptrT( \
1381-
&input1_data[start]))); \
1381+
sg.load<vec_sz>(input1_multi_ptr)); \
13821382
sycl::vec<_DataType_output, vec_sz> x2 = \
13831383
dpnp_vec_cast<_DataType_output, \
13841384
_DataType_input2, vec_sz>( \
1385-
sg.load<vec_sz>(input2_ptrT( \
1386-
&input2_data[start]))); \
1385+
sg.load<vec_sz>(input2_multi_ptr)); \
13871386
\
13881387
res_vec = __vec_operation__; \
13891388
} \
13901389
} \
13911390
else { \
13921391
sycl::vec<_DataType_input1, vec_sz> x1 = \
1393-
sg.load<vec_sz>( \
1394-
input1_ptrT(&input1_data[start])); \
1392+
sg.load<vec_sz>(input1_multi_ptr); \
13951393
sycl::vec<_DataType_input2, vec_sz> x2 = \
1396-
sg.load<vec_sz>( \
1397-
input2_ptrT(&input2_data[start])); \
1394+
sg.load<vec_sz>(input2_multi_ptr); \
13981395
\
13991396
for (size_t k = 0; k < vec_sz; ++k) { \
14001397
const _DataType_output input1_elem = x1[k]; \
14011398
const _DataType_output input2_elem = x2[k]; \
14021399
res_vec[k] = __operation__; \
14031400
} \
14041401
} \
1405-
sg.store<vec_sz>(result_ptrT(&result[start]), \
1406-
res_vec); \
1402+
sg.store<vec_sz>(result_multi_ptr, res_vec); \
14071403
} \
14081404
else { \
14091405
for (size_t k = start + sg.get_local_id()[0]; \

dpnp/backend/kernels/dpnp_krnl_logic.cpp

Lines changed: 104 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ DPCTLSyclEventRef dpnp_all_c(DPCTLSyclQueueRef q_ref,
7474
sycl::nd_range<1> gws(gws_range, lws_range);
7575

7676
auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
77-
auto gr = nd_it.get_group();
77+
auto gr = nd_it.get_sub_group();
7878
const auto max_gr_size = gr.get_max_local_range()[0];
7979
const size_t start =
8080
vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +
@@ -127,8 +127,79 @@ DPCTLSyclEventRef (*dpnp_all_ext_c)(DPCTLSyclQueueRef,
127127
const DPCTLEventVectorRef) =
128128
dpnp_all_c<_DataType, _ResultType>;
129129

130-
template <typename _DataType1, typename _DataType2, typename _ResultType>
131-
class dpnp_allclose_c_kernel;
130+
template <typename _DataType1, typename _DataType2, typename _TolType>
131+
class dpnp_allclose_kernel;
132+
133+
// Checks whether two input arrays are element-wise equal within a tolerance.
//
// result[0] is first set to true through q.fill. For non-empty input a kernel
// that depends on that fill is submitted; it stores false into result[0] as
// soon as a sub-group finds an element pair violating
//     abs(array1[i] - array2[i]) <= atol_val + rtol_val * abs(array2[i]).
// When both element types are floating point, a pair containing an infinity
// passes only if the two values compare equal.
//
// Parameters:
//   q        - queue the fill operation and the kernel are submitted to
//   array1   - first input array, read at indices below size
//   array2   - second input array, read at indices below size
//   result   - output flag; only result[0] is written
//   size     - number of elements to compare
//   rtol_val - relative tolerance
//   atol_val - absolute tolerance
//
// Returns the fill event when size is zero, otherwise the event of the
// submitted kernel.
template <typename _DataType1, typename _DataType2, typename _TolType>
134+
static sycl::event dpnp_allclose(sycl::queue &q,
135+
const _DataType1 *array1,
136+
const _DataType2 *array2,
137+
bool *result,
138+
const size_t size,
139+
const _TolType rtol_val,
140+
const _TolType atol_val)
141+
{
142+
sycl::event fill_event = q.fill(result, true, 1);
143+
if (!size) {
144+
return fill_event;
145+
}
146+
147+
constexpr size_t lws = 64;
148+
constexpr size_t vec_sz = 8;
149+
150+
auto gws_range =
151+
sycl::range<1>(((size + lws * vec_sz - 1) / (lws * vec_sz)) * lws);
152+
auto lws_range = sycl::range<1>(lws);
153+
sycl::nd_range<1> gws(gws_range, lws_range);
154+
155+
auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
156+
auto gr = nd_it.get_sub_group();
157+
const auto max_gr_size = gr.get_max_local_range()[0];
158+
const auto gr_size = gr.get_local_linear_range();
159+
const size_t start =
160+
vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +
161+
gr.get_group_linear_id() * max_gr_size);
162+
const size_t end = sycl::min(start + vec_sz * gr_size, size);
163+
164+
// each work-item iterates over "vec_sz" elements in the input arrays
165+
bool partial = true;
166+
167+
for (size_t i = start + gr.get_local_linear_id(); i < end; i += gr_size)
168+
{
169+
if constexpr (std::is_floating_point_v<_DataType1> &&
170+
std::is_floating_point_v<_DataType2>)
171+
{
172+
if (std::isinf(array1[i]) || std::isinf(array2[i])) {
173+
partial &= (array1[i] == array2[i]);
174+
continue;
175+
}
176+
}
177+
178+
// casting integral to floating type to avoid bad behavior
179+
// on abs(MIN_INT), which leads to undefined result
180+
using _Arr2Type = std::conditional_t<std::is_integral_v<_DataType2>,
181+
_TolType, _DataType2>;
182+
_Arr2Type arr2 = static_cast<_Arr2Type>(array2[i]);
183+
184+
partial &= (std::abs(array1[i] - arr2) <=
185+
(atol_val + rtol_val * std::abs(arr2)));
186+
}
// combine the per-work-item flags across the whole sub-group
187+
partial = sycl::all_of_group(gr, partial);
188+
189+
// only the sub-group leader stores, and only when a mismatch was found;
// result[0] keeps the true written by the fill otherwise
if (gr.leader() && (partial == false)) {
190+
result[0] = false;
191+
}
192+
};
193+
194+
auto kernel_func = [&](sycl::handler &cgh) {
195+
cgh.depends_on(fill_event);
196+
cgh.parallel_for<
197+
class dpnp_allclose_kernel<_DataType1, _DataType2, _TolType>>(
198+
gws, kernel_parallel_for_func);
199+
};
200+
201+
return q.submit(kernel_func);
202+
}
132203

133204
template <typename _DataType1, typename _DataType2, typename _ResultType>
134205
DPCTLSyclEventRef dpnp_allclose_c(DPCTLSyclQueueRef q_ref,
@@ -140,6 +211,9 @@ DPCTLSyclEventRef dpnp_allclose_c(DPCTLSyclQueueRef q_ref,
140211
double atol_val,
141212
const DPCTLEventVectorRef dep_event_vec_ref)
142213
{
214+
static_assert(std::is_same_v<_ResultType, bool>,
215+
"Boolean result type is required");
216+
143217
// avoid warning unused variable
144218
(void)dep_event_vec_ref;
145219

@@ -152,40 +226,21 @@ DPCTLSyclEventRef dpnp_allclose_c(DPCTLSyclQueueRef q_ref,
152226
sycl::queue q = *(reinterpret_cast<sycl::queue *>(q_ref));
153227
sycl::event event;
154228

155-
DPNPC_ptr_adapter<_DataType1> input1_ptr(q_ref, array1_in, size);
156-
DPNPC_ptr_adapter<_DataType2> input2_ptr(q_ref, array2_in, size);
157-
DPNPC_ptr_adapter<_ResultType> result1_ptr(q_ref, result1, 1, true, true);
158-
const _DataType1 *array1 = input1_ptr.get_ptr();
159-
const _DataType2 *array2 = input2_ptr.get_ptr();
160-
_ResultType *result = result1_ptr.get_ptr();
161-
162-
result[0] = true;
229+
const _DataType1 *array1 = static_cast<const _DataType1 *>(array1_in);
230+
const _DataType2 *array2 = static_cast<const _DataType2 *>(array2_in);
231+
bool *result = static_cast<bool *>(result1);
163232

164-
if (!size) {
165-
return event_ref;
233+
if (q.get_device().has(sycl::aspect::fp64)) {
234+
event =
235+
dpnp_allclose(q, array1, array2, result, size, rtol_val, atol_val);
236+
}
237+
else {
238+
float rtol = static_cast<float>(rtol_val);
239+
float atol = static_cast<float>(atol_val);
240+
event = dpnp_allclose(q, array1, array2, result, size, rtol, atol);
166241
}
167-
168-
sycl::range<1> gws(size);
169-
auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {
170-
size_t i = global_id[0];
171-
172-
if (std::abs(array1[i] - array2[i]) >
173-
(atol_val + rtol_val * std::abs(array2[i])))
174-
{
175-
result[0] = false;
176-
}
177-
};
178-
179-
auto kernel_func = [&](sycl::handler &cgh) {
180-
cgh.parallel_for<
181-
class dpnp_allclose_c_kernel<_DataType1, _DataType2, _ResultType>>(
182-
gws, kernel_parallel_for_func);
183-
};
184-
185-
event = q.submit(kernel_func);
186242

187243
event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);
188-
189244
return DPCTLEvent_Copy(event_ref);
190245
}
191246

@@ -269,7 +324,7 @@ DPCTLSyclEventRef dpnp_any_c(DPCTLSyclQueueRef q_ref,
269324
sycl::nd_range<1> gws(gws_range, lws_range);
270325

271326
auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
272-
auto gr = nd_it.get_group();
327+
auto gr = nd_it.get_sub_group();
273328
const auto max_gr_size = gr.get_max_local_range()[0];
274329
const size_t start =
275330
vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +
@@ -521,8 +576,6 @@ DPCTLSyclEventRef (*dpnp_any_ext_c)(DPCTLSyclQueueRef,
521576
else { \
522577
constexpr size_t lws = 64; \
523578
constexpr unsigned int vec_sz = 8; \
524-
constexpr sycl::access::address_space global_space = \
525-
sycl::access::address_space::global_space; \
526579
\
527580
auto gws_range = sycl::range<1>( \
528581
((result_size + lws * vec_sz - 1) / (lws * vec_sz)) * lws); \
@@ -537,22 +590,28 @@ DPCTLSyclEventRef (*dpnp_any_ext_c)(DPCTLSyclQueueRef,
537590
\
538591
if (start + static_cast<size_t>(vec_sz) * max_sg_size < \
539592
result_size) { \
540-
sycl::vec<_DataType_input1, vec_sz> x1 = sg.load<vec_sz>( \
541-
sycl::multi_ptr<_DataType_input1, global_space>( \
542-
&input1_data[start])); \
543-
sycl::vec<_DataType_input2, vec_sz> x2 = sg.load<vec_sz>( \
544-
sycl::multi_ptr<_DataType_input2, global_space>( \
545-
&input2_data[start])); \
593+
auto input1_multi_ptr = sycl::address_space_cast< \
594+
sycl::access::address_space::global_space, \
595+
sycl::access::decorated::yes>(&input1_data[start]); \
596+
auto input2_multi_ptr = sycl::address_space_cast< \
597+
sycl::access::address_space::global_space, \
598+
sycl::access::decorated::yes>(&input2_data[start]); \
599+
auto result_multi_ptr = sycl::address_space_cast< \
600+
sycl::access::address_space::global_space, \
601+
sycl::access::decorated::yes>(&result[start]); \
602+
\
603+
sycl::vec<_DataType_input1, vec_sz> x1 = \
604+
sg.load<vec_sz>(input1_multi_ptr); \
605+
sycl::vec<_DataType_input2, vec_sz> x2 = \
606+
sg.load<vec_sz>(input2_multi_ptr); \
546607
sycl::vec<bool, vec_sz> res_vec; \
547608
\
548609
for (size_t k = 0; k < vec_sz; ++k) { \
549610
const _DataType_input1 input1_elem = x1[k]; \
550611
const _DataType_input2 input2_elem = x2[k]; \
551612
res_vec[k] = __operation__; \
552613
} \
553-
sg.store<vec_sz>( \
554-
sycl::multi_ptr<bool, global_space>(&result[start]), \
555-
res_vec); \
614+
sg.store<vec_sz>(result_multi_ptr, res_vec); \
556615
} \
557616
else { \
558617
for (size_t k = start; k < result_size; ++k) { \

0 commit comments

Comments
 (0)