Skip to content

Commit e9a1ace

Browse files
authored
[ESIMD] Optimize the simd stride constructor (#12553)
simd(base, stride) calls were previously lowered into a long sequence of INSERT and ADD operations. That sequence is now replaced with a vector equivalent: vbase = broadcast(base); vstride = broadcast(stride); vstride_coef = {0, 1, 2, 3, ..., N-1}; vec_result = vbase + vstride * vstride_coef. --------- Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
1 parent 20aee78 commit e9a1ace

File tree

5 files changed

+87
-41
lines changed

5 files changed

+87
-41
lines changed

sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -124,13 +124,23 @@ constexpr vector_type_t<T, N> make_vector(const T (&&Arr)[N]) {
124124
}
125125

126126
template <class T, int N, size_t... Is>
127-
constexpr vector_type_t<T, N> make_vector_impl(T Base, T Stride,
128-
std::index_sequence<Is...>) {
129-
return vector_type_t<T, N>{(T)(Base + ((T)Is) * Stride)...};
127+
constexpr auto make_vector_impl(T Base, T Stride, std::index_sequence<Is...>) {
128+
if constexpr (std::is_integral_v<T> && N <= 3) {
129+
// This sequence is a bit more efficient for integral types and N <= 3.
130+
return vector_type_t<T, N>{(T)(Base + ((T)Is) * Stride)...};
131+
} else {
132+
using CppT = typename element_type_traits<T>::EnclosingCppT;
133+
CppT BaseCpp = Base;
134+
CppT StrideCpp = Stride;
135+
vector_type_t<CppT, N> VBase = BaseCpp;
136+
vector_type_t<CppT, N> VStride = StrideCpp;
137+
vector_type_t<CppT, N> VStrideCoef{(CppT)(Is)...};
138+
vector_type_t<CppT, N> Result{VBase + VStride * VStrideCoef};
139+
return wrapper_type_converter<T>::template to_vector<N>(Result);
140+
}
130141
}
131142

132-
template <class T, int N>
133-
constexpr vector_type_t<T, N> make_vector(T Base, T Stride) {
143+
template <class T, int N> constexpr auto make_vector(T Base, T Stride) {
134144
return make_vector_impl<T, N>(Base, Stride, std::make_index_sequence<N>{});
135145
}
136146

@@ -265,18 +275,13 @@ class [[__sycl_detail__::__uses_aspects__(
265275
/// are initialized with the arithmetic progression defined by the arguments.
266276
/// For example, <code>simd<int, 4> x(1, 3)</code> will initialize x to the
267277
/// <code>{1, 4, 7, 10}</code> sequence.
268-
/// @param Val The start of the progression.
278+
/// If Ty is a floating-point type and \p Base or \p Step is +/-inf or nan,
279+
/// then this constructor has undefined behavior.
280+
/// @param Base The start of the progression.
269281
/// @param Step The step of the progression.
270-
simd_obj_impl(Ty Val, Ty Step) noexcept {
271-
__esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step));
272-
if constexpr (is_wrapper_elem_type_v<Ty> || !std::is_integral_v<Ty>) {
273-
for (int i = 0; i < N; ++i) {
274-
M_data[i] = bitcast_to_raw_type(Val);
275-
Val = binary_op<BinOp::add, Ty>(Val, Step);
276-
}
277-
} else {
278-
M_data = make_vector<Ty, N>(Val, Step);
279-
}
282+
simd_obj_impl(Ty Base, Ty Step) noexcept {
283+
__esimd_dbg_print(simd_obj_impl(Ty Base, Ty Step));
284+
M_data = make_vector<Ty, N>(Base, Step);
280285
}
281286

282287
/// Broadcast constructor. Given value is type-converted to the

sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -246,18 +246,8 @@ class run_test {
246246
});
247247
queue.wait_and_throw();
248248

249-
// Verify the base value was passed as-is
250-
if (!are_bitwise_equal(result[0], base_value)) {
251-
passed = false;
252-
log::fail(TestDescriptionT(data_type, BaseVal, Step),
253-
"Unexpected value at index 0, retrieved: ", result[0],
254-
", expected: ", base_value);
255-
}
256-
257-
// Verify the step value works as expected being passed to the fill
258-
// constructor.
259-
DataT expected_value = base_value;
260-
for (size_t i = 1; i < result.size(); ++i) {
249+
// Verify the result of the fill constructor for every element.
250+
for (size_t i = 0; i < result.size(); ++i) {
261251
if constexpr (BaseVal == init_val::nan || Step == init_val::nan) {
262252

263253
if (!std::isnan(result[i])) {
@@ -268,7 +258,7 @@ class run_test {
268258
}
269259
} else {
270260

271-
expected_value += step_value;
261+
DataT expected_value = base_value + (DataT)i * step_value;
272262
if (!are_bitwise_equal(result[i], expected_value)) {
273263
passed = false;
274264
log::fail(TestDescriptionT(data_type, BaseVal, Step),

sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@
1515
// The test verifies that simd fill constructor has no precision differences.
1616
// The test performs the following actions:
1717
// - call simd with predefined base and step values
18-
// - bitwise comparing that output[0] value is equal to base value and
19-
// output[i] is equal to output[i -1] + step_value
18+
// - verify, using bitwise comparison, that output[i] is equal to
//   base + i * step_value.
2019

2120
#include "ctor_fill.hpp"
2221

sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,22 @@ int main(int, char **) {
112112
}
113113
{
114114
const auto types = get_tested_types<tested_types::fp>();
115+
{
116+
const auto base_values =
117+
ctors::get_init_values_pack<init_val::negative>();
118+
const auto step_values =
119+
ctors::get_init_values_pack<init_val::positive>();
120+
passed &= for_all_combinations<ctors::run_test>(
121+
types, sizes, contexts, base_values, step_values, queue);
122+
}
123+
// The test cases below have never been guaranteed to behave in any particular
124+
// way when base and step values are set to inf or nan. They may or may not
125+
// work as expected by the checks in this test.
115126
{
116127
const auto base_values =
117128
ctors::get_init_values_pack<init_val::neg_inf>();
118-
const auto step_values = ctors::get_init_values_pack<init_val::max>();
129+
const auto step_values =
130+
ctors::get_init_values_pack<init_val::positive>();
119131
passed &= for_all_combinations<ctors::run_test>(
120132
types, sizes, contexts, base_values, step_values, queue);
121133
}

sycl/test/esimd/ctor_codegen.cpp

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,62 @@ SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION {
2424
// CHECK-NEXT: }
2525
}
2626

27-
// Base + step constructor, FP element type, loops exected - don't check.
28-
SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION {
29-
simd<double, 2> val(17, 3);
30-
return val;
27+
// Const base + step constructor, FP element type.
28+
SYCL_EXTERNAL auto double_base_step_const() SYCL_ESIMD_FUNCTION {
29+
// CHECK: define dso_local spir_func void @_Z22double_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
30+
return simd<double, 64>{1.0, 3.0};
31+
// CHECK: store <64 x double> <double 1.000000e+00, double 4.000000e+00, double 7.000000e+00, double 1.000000e+01, double 1.300000e+01, double 1.600000e+01, double 1.900000e+01, double 2.200000e+01, double 2.500000e+01, double 2.800000e+01, double 3.100000e+01, double 3.400000e+01, double 3.700000e+01, double 4.000000e+01, double 4.300000e+01, double 4.600000e+01, double 4.900000e+01, double 5.200000e+01, double 5.500000e+01, double 5.800000e+01, double 6.100000e+01, double 6.400000e+01, double 6.700000e+01, double 7.000000e+01, double 7.300000e+01, double 7.600000e+01, double 7.900000e+01, double 8.200000e+01, double 8.500000e+01, double 8.800000e+01, double 9.100000e+01, double 9.400000e+01, double 9.700000e+01, double 1.000000e+02, double 1.030000e+02, double 1.060000e+02, double 1.090000e+02, double 1.120000e+02, double 1.150000e+02, double 1.180000e+02, double 1.210000e+02, double 1.240000e+02, double 1.270000e+02, double 1.300000e+02, double 1.330000e+02, double 1.360000e+02, double 1.390000e+02, double 1.420000e+02, double 1.450000e+02, double 1.480000e+02, double 1.510000e+02, double 1.540000e+02, double 1.570000e+02, double 1.600000e+02, double 1.630000e+02, double 1.660000e+02, double 1.690000e+02, double 1.720000e+02, double 1.750000e+02, double 1.780000e+02, double 1.810000e+02, double 1.840000e+02, double 1.870000e+02, double 1.900000e+02>, ptr addrspace(4) %[[RES]]
32+
// CHECK-NEXT: ret void
33+
}
34+
35+
// Variable base + step constructor, FP element type.
36+
SYCL_EXTERNAL auto double_base_step_var(double base, double step) SYCL_ESIMD_FUNCTION {
37+
// CHECK: define dso_local spir_func void @_Z20double_base_step_vardd({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], double noundef %[[BASE:[a-zA-Z0-9_\.]+]], double noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} {
38+
return simd<double, 32>{base, step};
39+
// CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[BASE]], i64 0
40+
// CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[BASE_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer
41+
// CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[STEP]], i64 0
42+
// CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[STEP_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer
43+
// CHECK: %[[FMA_VEC:[a-zA-Z0-9_\.]+]] = tail call noundef <32 x double> @llvm.fmuladd.v32f64(<32 x double> %[[STEP_VEC]], <32 x double> <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00, double 5.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00, double 9.000000e+00, double 1.000000e+01, double 1.100000e+01, double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01, double 1.600000e+01, double 1.700000e+01, double 1.800000e+01, double 1.900000e+01, double 2.000000e+01, double 2.100000e+01, double 2.200000e+01, double 2.300000e+01, double 2.400000e+01, double 2.500000e+01, double 2.600000e+01, double 2.700000e+01, double 2.800000e+01, double 2.900000e+01, double 3.000000e+01, double 3.100000e+01>, <32 x double> %[[BASE_VEC]])
44+
// CHECK: store <32 x double> %[[FMA_VEC]], ptr addrspace(4) %[[RES]]
45+
// CHECK-NEXT: ret void
3146
}
3247

33-
// Base + step constructor, integer element type, no loops exected - check.
34-
SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION {
35-
// CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
36-
simd<int, 2> val(17, 3);
48+
// Const base + step constructor, integer element type.
49+
SYCL_EXTERNAL auto int_base_step_const() SYCL_ESIMD_FUNCTION {
50+
// CHECK: define dso_local spir_func void @_Z19int_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
51+
simd<int, 16> val(17, 3);
3752
return val;
38-
// CHECK: store <2 x i32> <i32 17, i32 20>, ptr addrspace(4) %[[RES]]
53+
// CHECK: store <16 x i32> <i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62>, ptr addrspace(4) %[[RES]]
3954
// CHECK-NEXT: ret void
4055
// CHECK-NEXT: }
4156
}
4257

58+
// Variable base + step constructor, integer element type.
59+
SYCL_EXTERNAL auto int_base_step_var(int base, int step) SYCL_ESIMD_FUNCTION {
60+
// CHECK: define dso_local spir_func void @_Z17int_base_step_varii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} {
61+
return simd<int, 32>{base, step};
62+
// CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[BASE]], i64 0
63+
// CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[BASE_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer
64+
// CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[STEP]], i64 0
65+
// CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[STEP_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer
66+
// CHECK: %[[MUL_VEC:[a-zA-Z0-9_\.]+]] = mul <32 x i32> %[[STEP_VEC]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
67+
// CHECK: %[[ADD_VEC:[a-zA-Z0-9_\.]+]] = add <32 x i32> %[[BASE_VEC]], %[[MUL_VEC]]
68+
// CHECK: store <32 x i32> %[[ADD_VEC]], ptr addrspace(4) %[[RES]]
69+
// CHECK-NEXT: ret void
70+
}
71+
72+
// Variable base + step constructor, integer element type.
73+
SYCL_EXTERNAL auto int_base_step_var_n2(int base, int step) SYCL_ESIMD_FUNCTION {
74+
// CHECK: define dso_local spir_func void @_Z20int_base_step_var_n2ii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} {
75+
return simd<int, 2>{base, step};
76+
// CHECK: %[[BASE_VEC_TMP1:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> poison, i32 %[[BASE]], i64 0
77+
// CHECK: %[[BASE_INC:[a-zA-Z0-9_\.]+]] = add nsw i32 %[[BASE]], %[[STEP]]
78+
// CHECK: %[[RESULT_VEC:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> %[[BASE_VEC_TMP1]], i32 %[[BASE_INC]], i64 1
79+
// CHECK: store <2 x i32> %[[RESULT_VEC]], ptr addrspace(4) %[[RES]]
80+
// CHECK-NEXT: ret void
81+
}
82+
4383
// Broadcast constructor, FP element type, no loops expected - check.
4484
SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION {
4585
// CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {

0 commit comments

Comments
 (0)