Skip to content

Commit 1dd01fa

Browse files
committed
Reduced number of computations for floor_divide between integers
- Instead of computing division and modulo across the whole sycl::vec up front, the result vector is now filled element by element, and the modulo is computed only for signed integer types
1 parent c31d020 commit 1dd01fa

File tree

1 file changed

+32
-44
lines changed

1 file changed

+32
-44
lines changed

dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp

Lines changed: 32 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -52,77 +52,59 @@ namespace tu_ns = dpctl::tensor::type_utils;
5252
template <typename argT1, typename argT2, typename resT>
5353
struct FloorDivideFunctor
5454
{
55-
56-
using supports_sg_loadstore = std::negation<
57-
std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
58-
using supports_vec = std::negation<
59-
std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
55+
using supports_sg_loadstore = std::true_type;
56+
using supports_vec = std::true_type;
6057

6158
resT operator()(const argT1 &in1, const argT2 &in2)
6259
{
6360
if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
64-
static_assert(std::is_same_v<argT1, argT2>);
65-
if (in2 == 0) {
61+
if (in2 == argT2(0)) {
6662
return resT(0);
6763
}
68-
auto tmp = in1 / in2;
69-
if constexpr (std::is_unsigned_v<argT1> ||
70-
std::is_unsigned_v<argT2>) {
71-
return tmp;
64+
if constexpr (std::is_signed_v<argT1> || std::is_signed_v<argT2>) {
65+
auto div = in1 / in2;
66+
auto mod = in1 % in2;
67+
auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0));
68+
return (div - corr);
7269
}
7370
else {
74-
auto rem = in1 % in2;
75-
auto corr = (rem != 0 && ((rem < 0) != (in2 < 0)));
76-
return (tmp - corr);
71+
return (in1 / in2);
7772
}
7873
}
7974
else {
80-
auto tmp = in1 / in2;
81-
return (tmp == 0) ? resT(tmp) : resT(std::floor(tmp));
75+
auto div = in1 / in2;
76+
return (div == resT(0)) ? div : resT(std::floor(div));
8277
}
8378
}
8479

8580
template <int vec_sz>
8681
sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
8782
const sycl::vec<argT2, vec_sz> &in2)
8883
{
89-
auto tmp = in1 / in2;
90-
using tmpT = typename decltype(tmp)::element_type;
91-
if constexpr (std::is_integral_v<tmpT>) {
92-
if constexpr (std::is_unsigned_v<tmpT>) {
84+
if constexpr (std::is_integral_v<resT>) {
85+
sycl::vec<resT, vec_sz> res;
9386
#pragma unroll
94-
for (int i = 0; i < vec_sz; ++i) {
95-
if (in2[i] == argT2(0)) {
96-
tmp[i] = tmpT(0);
97-
}
87+
for (int i = 0; i < vec_sz; ++i) {
88+
if (in2[i] == argT2(0)) {
89+
res[i] = resT(0);
9890
}
99-
}
100-
else {
101-
auto rem = in1 % in2;
102-
#pragma unroll
103-
for (int i = 0; i < vec_sz; ++i) {
104-
if (in2[i] == 0) {
105-
tmp[i] = tmpT(0);
106-
}
107-
else {
108-
tmpT corr =
109-
(rem[i] != 0 && ((rem[i] < 0) != (in2[i] < 0)));
110-
tmp[i] -= corr;
91+
else {
92+
res[i] = in1[i] / in2[i];
93+
if constexpr (std::is_signed_v<resT>) {
94+
auto mod = in1[i] % in2[i];
95+
auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0));
96+
res[i] -= corr;
11197
}
11298
}
11399
}
114-
if constexpr (std::is_same_v<resT, tmpT>) {
115-
return tmp;
116-
}
117-
else {
118-
using dpctl::tensor::type_utils::vec_cast;
119-
return vec_cast<resT, tmpT, vec_sz>(tmp);
120-
}
100+
return res;
121101
}
122102
else {
103+
auto tmp = in1 / in2;
104+
using tmpT = typename decltype(tmp)::element_type;
123105
#pragma unroll
124106
for (int i = 0; i < vec_sz; ++i) {
125-
if (in2[i] != 0) {
107+
if (in2[i] != argT2(0)) {
126108
tmp[i] = std::floor(tmp[i]);
127109
}
128110
}
@@ -135,6 +117,12 @@ struct FloorDivideFunctor
135117
}
136118
}
137119
}
120+
121+
private:
122+
bool l_xor(bool b1, bool b2) const
123+
{
124+
return (b1 != b2);
125+
}
138126
};
139127

140128
template <typename argT1,

0 commit comments

Comments
 (0)