Skip to content

Commit 6e82889

Browse files
Moved np.ndarray->dpt.usm_ndarray cast-and-copy out to separate file
1 parent 9afc181 commit 6e82889

File tree

4 files changed

+331
-223
lines changed

4 files changed

+331
-223
lines changed

dpctl/tensor/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ pybind11_add_module(${python_module_name} MODULE
2020
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp
2121
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
2222
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
23+
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
2324
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
2425
)
2526
target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
2+
//
3+
// Data Parallel Control (dpctl)
4+
//
5+
// Copyright 2020-2022 Intel Corporation
6+
//
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
//
11+
// http://www.apache.org/licenses/LICENSE-2.0
12+
//
13+
// Unless required by applicable law or agreed to in writing, software
14+
// distributed under the License is distributed on an "AS IS" BASIS,
15+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
// See the License for the specific language governing permissions and
17+
// limitations under the License.
18+
//
19+
//===----------------------------------------------------------------------===//
20+
///
21+
/// \file
22+
/// This file defines functions of dpctl.tensor._tensor_impl extensions
23+
//===----------------------------------------------------------------------===//
24+
25+
#include <CL/sycl.hpp>
26+
#include <vector>
27+
28+
#include "dpctl4pybind11.hpp"
29+
#include <pybind11/numpy.h>
30+
#include <pybind11/pybind11.h>
31+
32+
#include "kernels/copy_and_cast.hpp"
33+
#include "utils/strided_iters.hpp"
34+
#include "utils/type_dispatch.hpp"
35+
36+
#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
37+
#include "simplify_iteration_space.hpp"
38+
39+
namespace py = pybind11;
40+
namespace _ns = dpctl::tensor::detail;
41+
42+
namespace dpctl
43+
{
44+
namespace tensor
45+
{
46+
namespace py_internal
47+
{
48+
49+
// Function-pointer type of the kernel performing a blocking copy-and-cast
// from a host (NumPy) buffer into USM memory; declared in
// kernels/copy_and_cast.hpp.
using dpctl::tensor::kernels::copy_and_cast::
    copy_and_cast_from_host_blocking_fn_ptr_t;

// Dispatch table indexed as [dst_type_id][src_type_id]; filled in by
// init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables() below.
static copy_and_cast_from_host_blocking_fn_ptr_t
    copy_and_cast_from_host_blocking_dispatch_table[_ns::num_types]
                                                   [_ns::num_types];
55+
56+
/// @brief Copy the content of host NumPy array @p npy_src into USM-allocated
/// @p dst, casting element types when the two dtypes differ.
///
/// Blocking call: the host buffer may be reused by the caller as soon as
/// this function returns.
///
/// @param npy_src NumPy array to copy from (host memory).
/// @param dst     Destination usm_ndarray; must have the same rank and shape
///                as @p npy_src and enough addressable elements.
/// @param exec_q  Queue the copy is submitted to; must be compatible with the
///                allocation queue of @p dst.
/// @param depends Events the copy must wait for.
/// @throws py::value_error    on rank/shape mismatch, insufficient
///                            destination extent, or incompatible queues.
/// @throws std::runtime_error on device allocation failure, or when no
///                            copy-and-cast kernel exists for the type pair.
void copy_numpy_ndarray_into_usm_ndarray(
    py::array npy_src,
    dpctl::tensor::usm_ndarray dst,
    sycl::queue exec_q,
    const std::vector<sycl::event> &depends)
{
    int src_ndim = npy_src.ndim();
    int dst_ndim = dst.get_ndim();

    if (src_ndim != dst_ndim) {
        throw py::value_error("Source ndarray and destination usm_ndarray have "
                              "different array ranks, "
                              "i.e. different number of indices needed to "
                              "address array elements.");
    }

    const py::ssize_t *src_shape = npy_src.shape();
    const py::ssize_t *dst_shape = dst.get_shape_raw();
    bool shapes_equal(true);
    size_t src_nelems(1);
    for (int i = 0; i < src_ndim; ++i) {
        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
        src_nelems *= static_cast<size_t>(src_shape[i]);
    }

    if (!shapes_equal) {
        // NOTE(review): fixed typo in user-visible message
        // ("difference" -> "different")
        throw py::value_error("Source ndarray and destination usm_ndarray have "
                              "different shapes.");
    }

    if (src_nelems == 0) {
        // empty arrays: nothing to do
        return;
    }

    // destination must be ample enough to accommodate all elements of source
    // array
    auto dst_offsets = dst.get_minmax_offsets();
    {
        size_t range =
            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
        if (range + 1 < src_nelems) {
            throw py::value_error(
                "Destination array can not accommodate all the "
                "elements of source array.");
        }
    }

    sycl::queue dst_q = dst.get_queue();

    if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) {
        throw py::value_error("Execution queue is not compatible with the "
                              "allocation queue");
    }

    // here we assume that NumPy's type numbers agree with ours for types
    // supported in both
    int src_typenum =
        py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num;
    int dst_typenum = dst.get_typenum();

    auto array_types = dpctl::tensor::detail::usm_ndarray_types();
    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);

    py::buffer_info src_pybuf = npy_src.request();
    const char *const src_data = static_cast<const char *const>(src_pybuf.ptr);
    char *dst_data = dst.get_data();

    int src_flags = npy_src.flags();

    // check for applicability of special case:
    // same type && (both C-contiguous || both F-contiguous)
    // -> a single memcpy suffices, no casting kernel needed
    bool both_c_contig =
        ((src_flags & py::array::c_style) && dst.is_c_contiguous());
    bool both_f_contig =
        ((src_flags & py::array::f_style) && dst.is_f_contiguous());
    if (both_c_contig || both_f_contig) {
        if (src_type_id == dst_type_id) {
            int src_elem_size = npy_src.itemsize();

            sycl::event copy_ev =
                exec_q.memcpy(static_cast<void *>(dst_data),
                              static_cast<const void *>(src_data),
                              src_nelems * src_elem_size, depends);

            // wait for copy_ev to complete: the host buffer may be reused
            // by the caller as soon as we return
            copy_ev.wait_and_throw();

            return;
        }
        // With contract_iter2 in place, there is no need to write
        // dedicated kernels for casting between contiguous arrays
    }

    const py::ssize_t *src_strides =
        npy_src.strides(); // N.B.: strides in bytes
    const py::ssize_t *dst_strides =
        dst.get_strides_raw(); // N.B.: strides in elements

    using shT = std::vector<py::ssize_t>;
    shT simplified_shape;
    shT simplified_src_strides;
    shT simplified_dst_strides;
    py::ssize_t src_offset(0);
    py::ssize_t dst_offset(0);

    py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes
    constexpr py::ssize_t dst_itemsize = 1;        // item size in elements

    int nd = src_ndim;
    const py::ssize_t *shape = src_shape;

    bool is_src_c_contig = ((src_flags & py::array::c_style) != 0);
    bool is_src_f_contig = ((src_flags & py::array::f_style) != 0);

    bool is_dst_c_contig = dst.is_c_contiguous();
    bool is_dst_f_contig = dst.is_f_contiguous();

    // all args except itemsizes and is_?_contig bools can be modified by
    // reference
    simplify_iteration_space(nd, shape, src_strides, src_itemsize,
                             is_src_c_contig, is_src_f_contig, dst_strides,
                             dst_itemsize, is_dst_c_contig, is_dst_f_contig,
                             simplified_shape, simplified_src_strides,
                             simplified_dst_strides, src_offset, dst_offset);

    assert(simplified_shape.size() == static_cast<size_t>(nd));
    assert(simplified_src_strides.size() == static_cast<size_t>(nd));
    assert(simplified_dst_strides.size() == static_cast<size_t>(nd));

    // handle nd == 0 (zero-dimensional arrays) as a 1-element 1D copy
    if (nd == 0) {
        nd = 1;
        simplified_shape.reserve(nd);
        simplified_shape.push_back(1);

        simplified_src_strides.reserve(nd);
        simplified_src_strides.push_back(src_itemsize);

        simplified_dst_strides.reserve(nd);
        simplified_dst_strides.push_back(dst_itemsize);
    }

    // Minimum and maximum element offsets for source np.ndarray
    py::ssize_t npy_src_min_nelem_offset(0);
    py::ssize_t npy_src_max_nelem_offset(0);
    for (int i = 0; i < nd; ++i) {
        // convert source strides from bytes to elements
        simplified_src_strides[i] = simplified_src_strides[i] / src_itemsize;
        if (simplified_src_strides[i] < 0) {
            npy_src_min_nelem_offset +=
                simplified_src_strides[i] * (simplified_shape[i] - 1);
        }
        else {
            npy_src_max_nelem_offset +=
                simplified_src_strides[i] * (simplified_shape[i] - 1);
        }
    }

    // Get implementation function pointer
    auto copy_and_cast_from_host_blocking_fn =
        copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
                                                       [src_type_id];

    // guard against a type pair without a registered kernel rather than
    // calling through a null function pointer below
    if (copy_and_cast_from_host_blocking_fn == nullptr) {
        throw std::runtime_error(
            "Copying between given array types is not supported");
    }

    // Pack shape and src/dst strides and copy the packet into device memory.
    // If shape/strides are accessed with accessors, buffer destructor
    // will force synchronization.
    py::ssize_t *shape_strides =
        sycl::malloc_device<py::ssize_t>(3 * nd, exec_q);

    if (shape_strides == nullptr) {
        throw std::runtime_error("Unable to allocate device memory");
    }

    using usm_host_allocatorT =
        sycl::usm_allocator<py::ssize_t, sycl::usm::alloc::host>;
    using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>;
    usm_host_allocatorT alloc(exec_q);

    try {
        auto host_shape_strides_shp = std::make_shared<usmshT>(3 * nd, alloc);
        std::copy(simplified_shape.begin(), simplified_shape.end(),
                  host_shape_strides_shp->begin());
        std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
                  host_shape_strides_shp->begin() + nd);
        std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
                  host_shape_strides_shp->begin() + 2 * nd);

        sycl::event copy_packed_ev = exec_q.copy<py::ssize_t>(
            host_shape_strides_shp->data(), shape_strides,
            host_shape_strides_shp->size());

        // blocking call: host data is no longer needed once it returns
        copy_and_cast_from_host_blocking_fn(
            exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
            npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
            dst_offset, depends, {copy_packed_ev});
    } catch (...) {
        // do not leak the device allocation if packing, the packet copy,
        // or the kernel submission throws
        sycl::free(shape_strides, exec_q);
        throw;
    }

    sycl::free(shape_strides, exec_q);

    return;
}
260+
261+
void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void)
262+
{
263+
using namespace dpctl::tensor::detail;
264+
using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory;
265+
266+
DispatchTableBuilder<copy_and_cast_from_host_blocking_fn_ptr_t,
267+
CopyAndCastFromHostFactory, _ns::num_types>
268+
dtb_copy_from_numpy;
269+
270+
dtb_copy_from_numpy.populate_dispatch_table(
271+
copy_and_cast_from_host_blocking_dispatch_table);
272+
}
273+
274+
} // namespace py_internal
275+
} // namespace tensor
276+
} // namespace dpctl
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
2+
//
3+
// Data Parallel Control (dpctl)
4+
//
5+
// Copyright 2020-2022 Intel Corporation
6+
//
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
//
11+
// http://www.apache.org/licenses/LICENSE-2.0
12+
//
13+
// Unless required by applicable law or agreed to in writing, software
14+
// distributed under the License is distributed on an "AS IS" BASIS,
15+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
// See the License for the specific language governing permissions and
17+
// limitations under the License.
18+
//
19+
//===----------------------------------------------------------------------===//
20+
///
21+
/// \file
22+
/// This file defines functions of dpctl.tensor._tensor_impl extensions
23+
//===----------------------------------------------------------------------===//
24+
25+
#pragma once
26+
#include <CL/sycl.hpp>
27+
#include <vector>
28+
29+
#include "dpctl4pybind11.hpp"
30+
#include <pybind11/numpy.h>
31+
#include <pybind11/pybind11.h>
32+
33+
namespace dpctl
{
namespace tensor
{
namespace py_internal
{

/// Copies the content of NumPy array \p npy_src into usm_ndarray \p dst,
/// casting element types when the dtypes differ. Blocking call.
/// Throws py::value_error on rank/shape mismatch, when the destination
/// cannot accommodate all source elements, or when \p exec_q is not
/// compatible with the destination's allocation queue.
extern void copy_numpy_ndarray_into_usm_ndarray(
    py::array npy_src,
    dpctl::tensor::usm_ndarray dst,
    sycl::queue exec_q,
    const std::vector<sycl::event> &depends = {});

/// Populates the dispatch table of copy-and-cast kernels used by
/// copy_numpy_ndarray_into_usm_ndarray; presumably invoked once during
/// module initialization — confirm against the _tensor_impl call site.
extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void);

} // namespace py_internal
} // namespace tensor
} // namespace dpctl

0 commit comments

Comments
 (0)