Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 56c3b55

Browse files
Part3. Manual replacement of block_load
1 parent be5f269 commit 56c3b55

21 files changed

+84
-47
lines changed

SYCL/ESIMD/BitonicSortK.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ ESIMD_INLINE simd<ty, size> cmk_read(AccTy buf, uint32_t offset) {
4343
offset *= sizeof(ty);
4444
#pragma unroll
4545
for (uint32_t i = 0; i < size; i += 32) {
46-
v.template select<32, 1>(i) = block_load<ty, 32, AccTy>(buf, offset);
46+
simd<ty, 32> data;
47+
data.copy_from(buf, offset);
48+
v.template select<32, 1>(i) = data;
4749
offset += 32 * sizeof(ty);
4850
}
4951
return v;

SYCL/ESIMD/BitonicSortKv2.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ ESIMD_INLINE simd<ty, size> cmk_read(ty *buf, uint32_t offset) {
4343
simd<ty, size> v;
4444
#pragma unroll
4545
for (uint32_t i = 0; i < size; i += 32) {
46-
v.template select<32, 1>(i) = block_load<ty, 32>(buf + offset + i);
46+
simd<ty, 32> data;
47+
data.copy_from(buf + offset + i);
48+
v.template select<32, 1>(i) = data;
4749
}
4850
return v;
4951
}

SYCL/ESIMD/Prefix_Local_sum1.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,15 +70,18 @@ void cmk_sum_tuple_count(unsigned int *buf, unsigned int h_pos) {
7070
simd<unsigned, 32 * TUPLE_SZ> S, T;
7171
#pragma unroll
7272
for (int i = 0; i < TUPLE_SZ; i++) {
73-
S.select<32, 1>(i * 32) = block_load<unsigned, 32>(buf + offset + i * 32);
73+
simd<unsigned, 32> data;
74+
data.copy_from(buf + offset + i * 32);
75+
S.select<32, 1>(i * 32) = data;
7476
}
7577

7678
#pragma unroll
7779
for (int i = 1; i < PREFIX_ENTRIES / 32; i++) {
7880
#pragma unroll
7981
for (int j = 0; j < TUPLE_SZ; j++) {
80-
T.select<32, 1>(j * 32) =
81-
block_load<unsigned, 32>(buf + offset + i * 32 * TUPLE_SZ + j * 32);
82+
simd<unsigned, 32> data;
83+
data.copy_from(buf + offset + i * 32 * TUPLE_SZ + j * 32);
84+
T.select<32, 1>(j * 32) = data;
8285
}
8386
S += T;
8487
}
@@ -109,7 +112,7 @@ void cmk_sum_tuple_count(unsigned int *buf, unsigned int h_pos) {
109112
// This is a ULT test variant of PrefixSum kernel with different implementation
110113
// to increase test coverage of different usage cases and help isolate bugs.
111114
// Difference from PrefixSum kernel:
112-
// - Use block_load<>() to read in data
115+
// - Use copy_from<>() to read in data
113116
// - Use scatter<>() to write output
114117
//************************************
115118
int main(int argc, char *argv[]) {

SYCL/ESIMD/Stencil.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,9 @@ int main(int argc, char *argv[]) {
123123
unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
124124
#pragma unroll
125125
for (unsigned i = 0; i < 10; i++) {
126-
in.row(i) = block_load<float, 32>(inputMatrix + off);
126+
simd<float, 32> data;
127+
data.copy_from(inputMatrix + off);
128+
in.row(i) = data;
127129
off += DIM_SIZE;
128130
}
129131

@@ -134,8 +136,9 @@ int main(int argc, char *argv[]) {
134136

135137
#pragma unroll
136138
for (unsigned i = 0; i < HEIGHT; i++) {
137-
138-
in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
139+
simd<float, 32> data;
140+
data.copy_from(inputMatrix + off);
141+
in.row(10 + i) = data;
139142
off += DIM_SIZE;
140143

141144
simd<float, WIDTH> sum =

SYCL/ESIMD/dp4a.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,13 @@ int main(void) {
6060
using namespace sycl::ext::intel::experimental::esimd;
6161

6262
simd<DTYPE, SIZE> src0(0);
63-
src0 = block_load<DTYPE, SIZE>(S0);
63+
src0.copy_from(S0);
6464

6565
simd<DTYPE, SIZE> src1(0);
66-
src1 = block_load<DTYPE, SIZE>(S1);
66+
src1.copy_from(S1);
6767

6868
simd<DTYPE, SIZE> src2(0);
69-
src2 = block_load<DTYPE, SIZE>(S2);
69+
src2.copy_from(S2);
7070

7171
auto res =
7272
esimd_dp4a<DTYPE, DTYPE, DTYPE, DTYPE, SIZE>(src0, src1, src2);

SYCL/ESIMD/fp_args_size/Inputs/fp_args_size_common.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,12 @@ int main(void) {
6565
simd<a_data_t, SIZE> va(0);
6666
simd<b_data_t, SIZE> vb(0);
6767
for (int j = 0; j < ROWS; j++) {
68-
va.select<VL, 1>(j * VL) = block_load<a_data_t, VL>(A + j * VL);
69-
vb.select<VL, 1>(j * VL) = block_load<b_data_t, VL>(B + j * VL);
68+
simd<a_data_t, VL> a_data;
69+
a_data.copy_from(A + j * VL);
70+
va.select<VL, 1>(j * VL) = a_data;
71+
simd<b_data_t, VL> b_data;
72+
b_data.copy_from(B + j * VL);
73+
vb.select<VL, 1>(j * VL) = b_data;
7074
}
7175

7276
auto foo = &add<simd<a_data_t, SIZE>, simd<b_data_t, SIZE>,

SYCL/ESIMD/histogram.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,10 @@ int main(int argc, char *argv[]) {
203203
bins, offset, src, 1);
204204
offset += 8 * sizeof(unsigned int);
205205
#else
206-
auto vals = block_load<unsigned int, 8>(bins + i);
207-
vals = vals + src;
208-
vals.copy_to(bins + i);
206+
simd<unsigned int, 8> vals;
207+
vals.copy_from(bins + i);
208+
vals = vals + src;
209+
vals.copy_to(bins + i);
209210
#endif
210211
}
211212
});

SYCL/ESIMD/histogram_256_slm.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
4343
auto start_off = (linear_id * BLOCK_WIDTH * NUM_BLOCKS);
4444
for (int y = 0; y < NUM_BLOCKS; y++) {
4545
auto start_addr = ((unsigned int *)input_ptr) + start_off;
46-
auto data = block_load<uint, 32>(start_addr);
46+
simd<uint, 32> data;
47+
data.copy_from(start_addr);
4748
auto in = data.format<uchar>();
4849

4950
#pragma unroll

SYCL/ESIMD/histogram_256_slm_spec.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
4444
auto start_off = (linear_id * BLOCK_WIDTH * num_blocks);
4545
for (int y = 0; y < num_blocks; y++) {
4646
auto start_addr = ((unsigned int *)input_ptr) + start_off;
47-
auto data = block_load<uint, 32>(start_addr);
47+
simd<uint, 32> data;
48+
data.copy_from(start_addr);
4849
auto in = data.format<uchar>();
4950

5051
#pragma unroll

SYCL/ESIMD/histogram_2d.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,8 @@ int main(int argc, char *argv[]) {
195195
bins, offset, src, 1);
196196
offset += 8 * sizeof(unsigned int);
197197
#else
198-
auto vals = block_load<unsigned int, 8>(bins + i);
198+
simd<unsigned int, 8> vals;
199+
vals.copy_from(bins + i);
199200
vals = vals + src;
200201
vals.copy_to(bins + i);
201202
#endif

SYCL/ESIMD/histogram_raw_send.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,8 @@ int main(int argc, char *argv[]) {
220220
bins, offset, src, 1);
221221
offset += 8 * sizeof(unsigned int);
222222
#else
223-
auto vals = block_load<unsigned int, 8>(bins + i);
223+
simd<unsigned int, 8> vals;
224+
vals.copy_from(bins + i);
224225
vals = vals + src;
225226
vals.copy_to(bins + i);
226227
#endif

SYCL/ESIMD/kmeans/kmeans.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,8 +243,9 @@ int main(int argc, char *argv[]) {
243243

244244
#pragma unroll
245245
for (int i = 0; i < NUM_CENTROIDS_ALLOCATED / SIMD_SIZE; i++) {
246-
centroidsXYXY.row(i) =
247-
block_load<float, 2 * SIMD_SIZE>(kcentroids4[i].xyn);
246+
simd<float, 2 * SIMD_SIZE> data;
247+
data.copy_from(kcentroids4[i].xyn);
248+
centroidsXYXY.row(i) = data;
248249
}
249250

250251
simd<float, NUM_CENTROIDS_ALLOCATED> accumxsum(0);
@@ -269,8 +270,7 @@ int main(int argc, char *argv[]) {
269270
auto pointsXY = points.format<float, 2, SIMD_SIZE>();
270271
simd<int, SIMD_SIZE> cluster(0);
271272

272-
points =
273-
block_load<float, 2 * SIMD_SIZE>(kpoints4[index + i].xyn);
273+
points.copy_from(kpoints4[index + i].xyn);
274274

275275
simd<float, SIMD_SIZE> dx =
276276
pointsXY.row(0) - centroidsXY.row(0)[0];
@@ -355,8 +355,7 @@ int main(int argc, char *argv[]) {
355355
simd<float, SIMD_SIZE> t;
356356
t.copy_from(kaccum4[it.get_global_id(0)].x_sum + offset);
357357
xsum += t;
358-
t = block_load<float, SIMD_SIZE>(
359-
kaccum4[it.get_global_id(0)].y_sum + offset);
358+
t.copy_from(kaccum4[it.get_global_id(0)].y_sum + offset);
360359
ysum += t;
361360
simd<int, SIMD_SIZE> n;
362361
n.copy_from(kaccum4[it.get_global_id(0)].num_points + offset);

SYCL/ESIMD/matrix_transpose.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,9 @@ ESIMD_INLINE simd<T, N * N> read(AccessorTy buf, int MZ, int col, int row) {
115115
uint32_t offset = (row * MZ + col) * sizeof(T);
116116
#pragma unroll
117117
for (int i = 0; i < N; ++i) {
118-
res.template select<N, 1>(i * N) = block_load<T, N>(buf, offset);
118+
simd<T, N> data;
119+
data.copy_from(buf, offset);
120+
res.template select<N, 1>(i * N) = data;
119121
offset += MZ * sizeof(T);
120122
}
121123
return res;

SYCL/ESIMD/matrix_transpose_glb.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,9 @@ ESIMD_NOINLINE void read(int *buf, int MZ, int col, int row, int GrfIdx) {
117117
buf += row * MZ + col;
118118
#pragma unroll
119119
for (int i = 0; i < N; ++i) {
120-
res.template select<N, 1>(i * N) = block_load<int, N>(buf);
120+
simd<int, N> data;
121+
data.copy_from(buf);
122+
res.template select<N, 1>(i * N) = data;
121123
buf += MZ;
122124
}
123125
}

SYCL/ESIMD/matrix_transpose_usm.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ ESIMD_INLINE simd<T, N * N> read(T *buf, int MZ, int col, int row) {
116116
buf += row * MZ + col;
117117
#pragma unroll
118118
for (int i = 0; i < N; ++i) {
119-
res.template select<N, 1>(i * N) = block_load<T, N>(buf);
119+
simd<T, N> data;
120+
data.copy_from(buf);
121+
res.template select<N, 1>(i * N) = data;
120122
buf += MZ;
121123
}
122124
return res;

SYCL/ESIMD/noinline_args_size/Inputs/noinline_args_size_common.hpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,20 @@ int main(void) {
6464
simd<a_data_t, SIZE> va(0);
6565
simd<b_data_t, SIZE> vb(0);
6666
for (int j = 0; j < ROWS; j++) {
67-
va.select<VL, 1>(j * VL) = block_load<a_data_t, VL>(A + j * VL);
68-
vb.select<VL, 1>(j * VL) = block_load<b_data_t, VL>(B + j * VL);
67+
simd<a_data_t, VL> a_data;
68+
a_data.copy_from(A + j * VL);
69+
va.select<VL, 1>(j * VL) = a_data;
70+
simd<b_data_t, VL> b_data;
71+
b_data.copy_from(B + j * VL);
72+
vb.select<VL, 1>(j * VL) = b_data;
6973
}
7074

7175
auto vc = add<simd<a_data_t, SIZE>, simd<b_data_t, SIZE>,
7276
simd<c_data_t, SIZE>>(va, vb);
7377

7478
for (int j = 0; j < ROWS; j++)
75-
block_store<c_data_t, VL>(C + j * VL, vc.select<VL, 1>(j * VL));
79+
simd<c_data_t, VL> vals =
80+
vc.select<VL, 1>(j * VL) vals.copy_to(C + j * VL);
7681
});
7782
});
7883

SYCL/ESIMD/regression/dgetrf.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,11 @@ ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) {
171171

172172
// load P1
173173
for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda)
174-
for (i = 0; i < M; i += 8)
175-
V8(p1, j * M + i) = block_load<double, 8>(a1 + i);
174+
for (i = 0; i < M; i += 8) {
175+
simd<double, 8> data;
176+
data.copy_from(a1 + i);
177+
V8(p1, j * M + i) = data;
178+
}
176179

177180
if (K > 0) {
178181
// (trsm) solve F*X=U for X, X overwrites U
@@ -182,7 +185,9 @@ ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) {
182185
simd<double, 8> a0k, aik;
183186
for (k = 0; k < 8 && kk + k < K; k++) {
184187
V1(mask, k) = 0;
185-
V8(a0k, 0) = block_load<double, 8>(a + kk + (kk + k) * lda);
188+
simd<double, 8> data;
189+
data.copy_from(a + kk + (kk + k) * lda);
190+
V8(a0k, 0) = data;
186191
for (j = 0; j < N; j++) {
187192
auto aj = V(p1, M, j * M);
188193
auto aj0 = V8(aj, kk);
@@ -192,7 +197,9 @@ ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) {
192197
}
193198
for (k = 0; k < 8 && kk + k < K; k++) {
194199
for (i = kk + 8; i < M; i += 8) {
195-
V8(aik, 0) = block_load<double, 8>(a + i + (kk + k) * lda);
200+
simd<double, 8> data;
201+
data.copy_from(a + i + (kk + k) * lda);
202+
V8(aik, 0) = data;
196203
for (j = 0; j < N; j++) {
197204
auto aj = V(p1, M, j * M);
198205
auto aj0 = V8(aj, kk);

SYCL/ESIMD/regression/unused_load.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
// RUN: %GPU_RUN_PLACEHOLDER %t.out
1212

1313
// This test checks that ESIMD JIT compilation does not crash on unused
14-
// block_load invocation.
14+
// copy_from invocation.
1515

1616
#include <CL/sycl.hpp>
1717
#include <CL/sycl/INTEL/esimd.hpp>

SYCL/ESIMD/slm_barrier.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,8 @@ void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
4848
simd<uint, 32> row0; // 32 floats or 128 Bytes or 4 GRF-registers
4949
simd<uint, 32> row1;
5050
simd<uint, 64> rowTrans;
51-
row0 = block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory));
52-
row1 =
53-
block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory + 128));
51+
row0.copy_from((const uint *)(addr + threadOffsetInMemory));
52+
row1.copy_from((const uint *)(addr + threadOffsetInMemory + 128));
5453

5554
// Transpose
5655
rowTrans.select<8, 1>(0) = row0.select<8, 4>(0);

SYCL/ESIMD/slm_split_barrier.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,8 @@ void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
4848
simd<uint, 32> row0; // 32 floats or 128 Bytes or 4 GRF-registers
4949
simd<uint, 32> row1;
5050
simd<uint, 64> rowTrans;
51-
row0 = block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory));
52-
row1 =
53-
block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory + 128));
51+
row0.copy_from((const uint *)(addr + threadOffsetInMemory));
52+
row1.copy_from((const uint *)(addr + threadOffsetInMemory + 128));
5453

5554
// Transpose
5655
rowTrans.select<8, 1>(0) = row0.select<8, 4>(0);

SYCL/ESIMD/stencil2.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,9 @@ int main(int argc, char *argv[]) {
125125
unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
126126
#pragma unroll
127127
for (unsigned i = 0; i < 10; i++) {
128-
in.row(i) = block_load<float, 32>(inputMatrix + off);
128+
simd<float, 32> data;
129+
data.copy_from(inputMatrix + off);
130+
in.row(i) = data;
129131
off += DIM_SIZE;
130132
}
131133

@@ -136,8 +138,9 @@ int main(int argc, char *argv[]) {
136138

137139
#pragma unroll
138140
for (unsigned i = 0; i < HEIGHT; i++) {
139-
140-
in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
141+
simd<float, 32> data;
142+
data.copy_from(inputMatrix + off);
143+
in.row(10 + i) = data;
141144
off += DIM_SIZE;
142145

143146
simd<float, WIDTH> sum =

0 commit comments

Comments
 (0)