Skip to content

Commit 1235902

Browse files
Change vec_sz and n_vecs settings for contiguous kernels
For short data types, each work-item may need to load several elements to ensure that it uses all the data from a cache line. For example, with simd32, we load four 8-bit elements (2 cache lines), two 16-bit elements, and one element for 32-bit and wider types. n_vecs is set to 1 to avoid cache thrashing, which occurs when some work-items begin a second iteration and access memory at higher addresses while other work-items are still working on the lower addresses, causing cache evictions. The size of the work-groups was increased from 128 to 256, which is chosen so that all 8 threads of a single vector with simd32 are used.
1 parent f623209 commit 1235902

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

74 files changed

+573
-265
lines changed

dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@
3232
#include <type_traits>
3333

3434
#include "cabs_impl.hpp"
35-
#include "kernels/elementwise_functions/common.hpp"
35+
#include "vec_size_util.hpp"
3636

3737
#include "kernels/dpctl_tensor_types.hpp"
38+
#include "kernels/elementwise_functions/common.hpp"
39+
3840
#include "utils/offset_utils.hpp"
3941
#include "utils/type_dispatch_building.hpp"
4042
#include "utils/type_utils.hpp"
@@ -50,6 +52,7 @@ namespace abs
5052

5153
namespace td_ns = dpctl::tensor::type_dispatch;
5254

55+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5356
using dpctl::tensor::type_utils::is_complex;
5457

5558
template <typename argT, typename resT> struct AbsFunctor
@@ -89,8 +92,8 @@ template <typename argT, typename resT> struct AbsFunctor
8992

9093
template <typename argT,
9194
typename resT = argT,
92-
unsigned int vec_sz = 4,
93-
unsigned int n_vecs = 2,
95+
unsigned int vec_sz = VecSize_v<argT, resT>,
96+
unsigned int n_vecs = 1,
9497
bool enable_sg_loadstore = true>
9598
using AbsContigFunctor =
9699
elementwise_common::UnaryContigFunctor<argT,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@
2929
#include <sycl/sycl.hpp>
3030
#include <type_traits>
3131

32-
#include "kernels/elementwise_functions/common.hpp"
3332
#include "sycl_complex.hpp"
33+
#include "vec_size_util.hpp"
3434

3535
#include "kernels/dpctl_tensor_types.hpp"
36+
#include "kernels/elementwise_functions/common.hpp"
37+
3638
#include "utils/offset_utils.hpp"
3739
#include "utils/type_dispatch_building.hpp"
3840
#include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace acos
4850

4951
namespace td_ns = dpctl::tensor::type_dispatch;
5052

53+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5154
using dpctl::tensor::type_utils::is_complex;
5255

5356
template <typename argT, typename resT> struct AcosFunctor
@@ -128,8 +131,8 @@ template <typename argT, typename resT> struct AcosFunctor
128131

129132
template <typename argTy,
130133
typename resTy = argTy,
131-
unsigned int vec_sz = 4,
132-
unsigned int n_vecs = 2,
134+
unsigned int vec_sz = VecSize_v<argTy, resTy>,
135+
unsigned int n_vecs = 1,
133136
bool enable_sg_loadstore = true>
134137
using AcosContigFunctor =
135138
elementwise_common::UnaryContigFunctor<argTy,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@
2929
#include <sycl/sycl.hpp>
3030
#include <type_traits>
3131

32-
#include "kernels/elementwise_functions/common.hpp"
3332
#include "sycl_complex.hpp"
33+
#include "vec_size_util.hpp"
3434

3535
#include "kernels/dpctl_tensor_types.hpp"
36+
#include "kernels/elementwise_functions/common.hpp"
37+
3638
#include "utils/offset_utils.hpp"
3739
#include "utils/type_dispatch_building.hpp"
3840
#include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace acosh
4850

4951
namespace td_ns = dpctl::tensor::type_dispatch;
5052

53+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5154
using dpctl::tensor::type_utils::is_complex;
5255

5356
template <typename argT, typename resT> struct AcoshFunctor
@@ -155,8 +158,8 @@ template <typename argT, typename resT> struct AcoshFunctor
155158

156159
template <typename argTy,
157160
typename resTy = argTy,
158-
unsigned int vec_sz = 4,
159-
unsigned int n_vecs = 2,
161+
unsigned int vec_sz = VecSize_v<argTy, resTy>,
162+
unsigned int n_vecs = 1,
160163
bool enable_sg_loadstore = true>
161164
using AcoshContigFunctor =
162165
elementwise_common::UnaryContigFunctor<argTy,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#include <type_traits>
3131

3232
#include "sycl_complex.hpp"
33+
#include "vec_size_util.hpp"
34+
3335
#include "utils/offset_utils.hpp"
3436
#include "utils/type_dispatch_building.hpp"
3537
#include "utils/type_utils.hpp"
@@ -50,6 +52,8 @@ namespace add
5052
namespace td_ns = dpctl::tensor::type_dispatch;
5153
namespace tu_ns = dpctl::tensor::type_utils;
5254

55+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
56+
5357
template <typename argT1, typename argT2, typename resT> struct AddFunctor
5458
{
5559

@@ -110,8 +114,8 @@ template <typename argT1, typename argT2, typename resT> struct AddFunctor
110114
template <typename argT1,
111115
typename argT2,
112116
typename resT,
113-
unsigned int vec_sz = 4,
114-
unsigned int n_vecs = 2,
117+
unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
118+
unsigned int n_vecs = 1,
115119
bool enable_sg_loadstore = true>
116120
using AddContigFunctor =
117121
elementwise_common::BinaryContigFunctor<argT1,
@@ -410,8 +414,8 @@ template <typename argT, typename resT> struct AddInplaceFunctor
410414

411415
template <typename argT,
412416
typename resT,
413-
unsigned int vec_sz = 4,
414-
unsigned int n_vecs = 2,
417+
unsigned int vec_sz = VecSize_v<argT, resT>,
418+
unsigned int n_vecs = 1,
415419
bool enable_sg_loadstore = true>
416420
using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
417421
argT,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@
3030
#include <sycl/sycl.hpp>
3131
#include <type_traits>
3232

33-
#include "kernels/elementwise_functions/common.hpp"
3433
#include "sycl_complex.hpp"
34+
#include "vec_size_util.hpp"
3535

3636
#include "kernels/dpctl_tensor_types.hpp"
37+
#include "kernels/elementwise_functions/common.hpp"
38+
3739
#include "utils/offset_utils.hpp"
3840
#include "utils/type_dispatch_building.hpp"
3941
#include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace angle
4951

5052
namespace td_ns = dpctl::tensor::type_dispatch;
5153

54+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5255
using dpctl::tensor::type_utils::is_complex;
5356

5457
template <typename argT, typename resT> struct AngleFunctor
@@ -74,8 +77,8 @@ template <typename argT, typename resT> struct AngleFunctor
7477

7578
template <typename argTy,
7679
typename resTy = argTy,
77-
unsigned int vec_sz = 4,
78-
unsigned int n_vecs = 2,
80+
unsigned int vec_sz = VecSize_v<argTy, resTy>,
81+
unsigned int n_vecs = 1,
7982
bool enable_sg_loadstore = true>
8083
using AngleContigFunctor =
8184
elementwise_common::UnaryContigFunctor<argTy,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@
2929
#include <sycl/sycl.hpp>
3030
#include <type_traits>
3131

32-
#include "kernels/elementwise_functions/common.hpp"
3332
#include "sycl_complex.hpp"
33+
#include "vec_size_util.hpp"
3434

3535
#include "kernels/dpctl_tensor_types.hpp"
36+
#include "kernels/elementwise_functions/common.hpp"
37+
3638
#include "utils/offset_utils.hpp"
3739
#include "utils/type_dispatch_building.hpp"
3840
#include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace asin
4850

4951
namespace td_ns = dpctl::tensor::type_dispatch;
5052

53+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5154
using dpctl::tensor::type_utils::is_complex;
5255

5356
template <typename argT, typename resT> struct AsinFunctor
@@ -148,8 +151,8 @@ template <typename argT, typename resT> struct AsinFunctor
148151

149152
template <typename argTy,
150153
typename resTy = argTy,
151-
unsigned int vec_sz = 4,
152-
unsigned int n_vecs = 2,
154+
unsigned int vec_sz = VecSize_v<argTy, resTy>,
155+
unsigned int n_vecs = 1,
153156
bool enable_sg_loadstore = true>
154157
using AsinContigFunctor =
155158
elementwise_common::UnaryContigFunctor<argTy,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@
2929
#include <sycl/sycl.hpp>
3030
#include <type_traits>
3131

32-
#include "kernels/elementwise_functions/common.hpp"
3332
#include "sycl_complex.hpp"
33+
#include "vec_size_util.hpp"
3434

3535
#include "kernels/dpctl_tensor_types.hpp"
36+
#include "kernels/elementwise_functions/common.hpp"
37+
3638
#include "utils/offset_utils.hpp"
3739
#include "utils/type_dispatch_building.hpp"
3840
#include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace asinh
4850

4951
namespace td_ns = dpctl::tensor::type_dispatch;
5052

53+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5154
using dpctl::tensor::type_utils::is_complex;
5255

5356
template <typename argT, typename resT> struct AsinhFunctor
@@ -131,8 +134,8 @@ template <typename argT, typename resT> struct AsinhFunctor
131134

132135
template <typename argTy,
133136
typename resTy = argTy,
134-
unsigned int vec_sz = 4,
135-
unsigned int n_vecs = 2,
137+
unsigned int vec_sz = VecSize_v<argTy, resTy>,
138+
unsigned int n_vecs = 1,
136139
bool enable_sg_loadstore = true>
137140
using AsinhContigFunctor =
138141
elementwise_common::UnaryContigFunctor<argTy,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@
3030
#include <sycl/sycl.hpp>
3131
#include <type_traits>
3232

33-
#include "kernels/elementwise_functions/common.hpp"
3433
#include "sycl_complex.hpp"
34+
#include "vec_size_util.hpp"
3535

3636
#include "kernels/dpctl_tensor_types.hpp"
37+
#include "kernels/elementwise_functions/common.hpp"
38+
3739
#include "utils/offset_utils.hpp"
3840
#include "utils/type_dispatch_building.hpp"
3941
#include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace atan
4951

5052
namespace td_ns = dpctl::tensor::type_dispatch;
5153

54+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5255
using dpctl::tensor::type_utils::is_complex;
5356

5457
template <typename argT, typename resT> struct AtanFunctor
@@ -138,8 +141,8 @@ template <typename argT, typename resT> struct AtanFunctor
138141

139142
template <typename argTy,
140143
typename resTy = argTy,
141-
unsigned int vec_sz = 4,
142-
unsigned int n_vecs = 2,
144+
unsigned int vec_sz = VecSize_v<argTy, resTy>,
145+
unsigned int n_vecs = 1,
143146
bool enable_sg_loadstore = true>
144147
using AtanContigFunctor =
145148
elementwise_common::UnaryContigFunctor<argTy,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
#include <sycl/sycl.hpp>
3030
#include <type_traits>
3131

32+
#include "vec_size_util.hpp"
33+
3234
#include "utils/offset_utils.hpp"
3335
#include "utils/type_dispatch_building.hpp"
3436
#include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace atan2
4850
namespace td_ns = dpctl::tensor::type_dispatch;
4951
namespace tu_ns = dpctl::tensor::type_utils;
5052

53+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
54+
5155
template <typename argT1, typename argT2, typename resT> struct Atan2Functor
5256
{
5357

@@ -68,8 +72,8 @@ template <typename argT1, typename argT2, typename resT> struct Atan2Functor
6872
template <typename argT1,
6973
typename argT2,
7074
typename resT,
71-
unsigned int vec_sz = 4,
72-
unsigned int n_vecs = 2,
75+
unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
76+
unsigned int n_vecs = 1,
7377
bool enable_sg_loadstore = true>
7478
using Atan2ContigFunctor =
7579
elementwise_common::BinaryContigFunctor<argT1,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@
3030
#include <sycl/sycl.hpp>
3131
#include <type_traits>
3232

33-
#include "kernels/elementwise_functions/common.hpp"
3433
#include "sycl_complex.hpp"
34+
#include "vec_size_util.hpp"
3535

3636
#include "kernels/dpctl_tensor_types.hpp"
37+
#include "kernels/elementwise_functions/common.hpp"
38+
3739
#include "utils/offset_utils.hpp"
3840
#include "utils/type_dispatch_building.hpp"
3941
#include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace atanh
4951

5052
namespace td_ns = dpctl::tensor::type_dispatch;
5153

54+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5255
using dpctl::tensor::type_utils::is_complex;
5356

5457
template <typename argT, typename resT> struct AtanhFunctor
@@ -132,8 +135,8 @@ template <typename argT, typename resT> struct AtanhFunctor
132135

133136
template <typename argTy,
134137
typename resTy = argTy,
135-
unsigned int vec_sz = 4,
136-
unsigned int n_vecs = 2,
138+
unsigned int vec_sz = VecSize_v<argTy, resTy>,
139+
unsigned int n_vecs = 1,
137140
bool enable_sg_loadstore = true>
138141
using AtanhContigFunctor =
139142
elementwise_common::UnaryContigFunctor<argTy,

dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include <sycl/sycl.hpp>
2929
#include <type_traits>
3030

31+
#include "vec_size_util.hpp"
32+
3133
#include "utils/offset_utils.hpp"
3234
#include "utils/type_dispatch_building.hpp"
3335
#include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace bitwise_and
4850
namespace td_ns = dpctl::tensor::type_dispatch;
4951
namespace tu_ns = dpctl::tensor::type_utils;
5052

53+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
54+
5155
template <typename argT1, typename argT2, typename resT>
5256
struct BitwiseAndFunctor
5357
{
@@ -91,8 +95,8 @@ struct BitwiseAndFunctor
9195
template <typename argT1,
9296
typename argT2,
9397
typename resT,
94-
unsigned int vec_sz = 4,
95-
unsigned int n_vecs = 2,
98+
unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
99+
unsigned int n_vecs = 1,
96100
bool enable_sg_loadstore = true>
97101
using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
98102
argT1,
@@ -290,8 +294,8 @@ template <typename argT, typename resT> struct BitwiseAndInplaceFunctor
290294

291295
template <typename argT,
292296
typename resT,
293-
unsigned int vec_sz = 4,
294-
unsigned int n_vecs = 2,
297+
unsigned int vec_sz = VecSize_v<argT, resT>,
298+
unsigned int n_vecs = 1,
295299
bool enable_sg_loadstore = true>
296300
using BitwiseAndInplaceContigFunctor =
297301
elementwise_common::BinaryInplaceContigFunctor<

dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#include <sycl/sycl.hpp>
3131
#include <type_traits>
3232

33+
#include "vec_size_util.hpp"
34+
3335
#include "utils/offset_utils.hpp"
3436
#include "utils/type_dispatch_building.hpp"
3537
#include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace bitwise_invert
4951
namespace td_ns = dpctl::tensor::type_dispatch;
5052
namespace tu_ns = dpctl::tensor::type_utils;
5153

54+
using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
5255
using dpctl::tensor::type_utils::vec_cast;
5356

5457
template <typename argT, typename resT> struct BitwiseInvertFunctor
@@ -80,8 +83,8 @@ template <typename argT, typename resT> struct BitwiseInvertFunctor
8083

8184
template <typename argT,
8285
typename resT = argT,
83-
unsigned int vec_sz = 4,
84-
unsigned int n_vecs = 2,
86+
unsigned int vec_sz = VecSize_v<argT, resT>,
87+
unsigned int n_vecs = 1,
8588
bool enable_sg_loadstore = true>
8689
using BitwiseInvertContigFunctor =
8790
elementwise_common::UnaryContigFunctor<argT,

0 commit comments

Comments
 (0)