Skip to content

Commit 8a27ef0

Browse files
committed
move norm to norm.cpp file
1 parent fb7dd64 commit 8a27ef0

File tree

5 files changed

+461
-417
lines changed

5 files changed

+461
-417
lines changed

ggml-sycl/norm.cpp

Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
#include "norm.hpp"
2+
3+
// Layer-norm kernel: normalizes one row of `x` (length `ncols`) to zero mean /
// unit variance and writes it to `dst`. One work-group handles one row;
// `block_size` is the number of threads cooperating on the row (WARP_SIZE for
// the single-warp launch, the full work-group size otherwise).
// `s_sum` is a local-memory scratch buffer with one float2 slot per warp; it is
// only dereferenced when block_size > WARP_SIZE (nullptr is passed otherwise).
static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
    const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
        item_ct1.get_local_id(1);
    const int tid = item_ct1.get_local_id(2);

    const int nthreads = item_ct1.get_local_range(2);
    const int nwarps = nthreads / WARP_SIZE;
    // NOTE(review): this requires nwarps itself to be a multiple of WARP_SIZE
    // (i.e. nthreads a multiple of WARP_SIZE*WARP_SIZE) so that the nreduce
    // loop below covers every warp slot; device asserts may be compiled out,
    // so the launcher must guarantee this — TODO confirm against
    // get_work_group_size().
    assert(nwarps % WARP_SIZE == 0);
    // Running (sum, sum of squares) accumulated per thread: x holds sum(xi),
    // y holds sum(xi^2).
    sycl::float2 mean_var = sycl::float2(0.f, 0.f);

    // Strided pass over the row: thread `tid` handles columns tid,
    // tid+block_size, tid+2*block_size, ...
    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row * ncols + col];
        mean_var.x() += xi;
        mean_var.y() += xi * xi;
    }

    // sum up partial sums
    mean_var = warp_reduce_sum(mean_var, item_ct1);
    if (block_size > WARP_SIZE) {
        // Cross-warp reduction: lane 0 of each warp publishes its warp's
        // partial into local memory, then the partials are re-reduced.
        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = mean_var;
        }
        /*
        DPCT1118:0: SYCL group functions and algorithms must be encountered in
        converged control flow. You may need to adjust the code.
        */
        item_ct1.barrier(sycl::access::fence_space::local_space);
        mean_var = 0.f;
        // Each lane gathers every WARP_SIZE-th slot so the nwarps partials are
        // spread across one warp before the final warp_reduce_sum.
        int nreduce = nwarps / WARP_SIZE;
        for (size_t i = 0; i < nreduce; i += 1)
        {
            mean_var += s_sum[lane_id + i * WARP_SIZE];
        }
        mean_var = warp_reduce_sum(mean_var, item_ct1);
    }

    // E[x], Var[x] = E[x^2] - E[x]^2, and the inverse standard deviation with
    // `eps` added for numerical stability.
    const float mean = mean_var.x() / ncols;
    const float var = mean_var.y() / ncols - mean * mean;
    const float inv_std = sycl::rsqrt(var + eps);

    // Second strided pass: write the normalized row.
    for (int col = tid; col < ncols; col += block_size) {
        dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
    }
}
51+
52+
static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps,
53+
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
54+
int start = item_ct1.get_group(2) * group_size;
55+
int end = start + group_size;
56+
const int nthreads = item_ct1.get_local_range(2);
57+
const int nwarps = nthreads / WARP_SIZE;
58+
assert(nwarps % WARP_SIZE == 0);
59+
start += item_ct1.get_local_id(2);
60+
61+
if (end >= ne_elements) {
62+
end = ne_elements;
63+
}
64+
65+
float tmp = 0.0f; // partial sum for thread in warp
66+
67+
for (int j = start; j < end; j += block_size) {
68+
tmp += x[j];
69+
}
70+
71+
tmp = warp_reduce_sum(tmp, item_ct1);
72+
if (block_size > WARP_SIZE) {
73+
74+
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
75+
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
76+
if (lane_id == 0) {
77+
s_sum[warp_id] = tmp;
78+
}
79+
/*
80+
DPCT1118:1: SYCL group functions and algorithms must be encountered in
81+
converged control flow. You may need to adjust the code.
82+
*/
83+
/*
84+
DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
85+
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
86+
better performance if there is no access to global memory.
87+
*/
88+
item_ct1.barrier();
89+
tmp = 0.f;
90+
int nreduce = nwarps / WARP_SIZE;
91+
for (size_t i = 0; i < nreduce; i += 1)
92+
{
93+
tmp += s_sum[lane_id + i * WARP_SIZE];
94+
}
95+
tmp = warp_reduce_sum(tmp, item_ct1);
96+
}
97+
98+
float mean = tmp / group_size;
99+
tmp = 0.0f;
100+
101+
for (int j = start; j < end; j += block_size) {
102+
float xi = x[j] - mean;
103+
dst[j] = xi;
104+
tmp += xi * xi;
105+
}
106+
107+
tmp = warp_reduce_sum(tmp, item_ct1);
108+
if (block_size > WARP_SIZE) {
109+
110+
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
111+
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
112+
if (lane_id == 0) {
113+
s_sum[warp_id] = tmp;
114+
}
115+
/*
116+
DPCT1118:2: SYCL group functions and algorithms must be encountered in
117+
converged control flow. You may need to adjust the code.
118+
*/
119+
/*
120+
DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
121+
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
122+
better performance if there is no access to global memory.
123+
*/
124+
item_ct1.barrier();
125+
tmp = s_sum[lane_id];
126+
tmp = warp_reduce_sum(tmp, item_ct1);
127+
}
128+
129+
float variance = tmp / group_size;
130+
float scale = sycl::rsqrt(variance + eps);
131+
for (int j = start; j < end; j += block_size) {
132+
dst[j] *= scale;
133+
}
134+
}
135+
136+
// RMS-norm kernel: scales one row of `x` (length `ncols`) by
// 1/sqrt(mean(x^2) + eps) and writes it to `dst`. Unlike norm_f32 no mean is
// subtracted. One work-group handles one row; `block_size` is the number of
// cooperating threads; `s_sum` is a per-warp local-memory scratch buffer,
// dereferenced only when block_size > WARP_SIZE (nullptr otherwise).
static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
    const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
        item_ct1.get_local_id(1);
    const int tid = item_ct1.get_local_id(2);
    const int nthreads = item_ct1.get_local_range(2);
    const int nwarps = nthreads / WARP_SIZE;
    // NOTE(review): requires nwarps to be a multiple of WARP_SIZE so the
    // nreduce gather below covers all warp slots; device asserts may be
    // compiled out, so the launcher must pick a conforming work-group size —
    // TODO confirm against get_work_group_size().
    assert(nwarps % WARP_SIZE == 0);
    float tmp = 0.0f; // partial sum for thread in warp

    // Strided pass: each thread accumulates the squares of its columns.
    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row * ncols + col];
        tmp += xi * xi;
    }

    // sum up partial sums
    tmp = warp_reduce_sum(tmp, item_ct1);
    if (block_size > WARP_SIZE) {
        // Cross-warp reduction: lane 0 of each warp publishes its partial to
        // local memory, then the partials are re-reduced within one warp.
        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        /*
        DPCT1118:3: SYCL group functions and algorithms must be encountered in
        converged control flow. You may need to adjust the code.
        */
        item_ct1.barrier(sycl::access::fence_space::local_space);
        int nreduce = nwarps / WARP_SIZE;
        tmp = 0.f;
        for (size_t i = 0; i < nreduce; i += 1)
        {
            tmp += s_sum[lane_id + i * WARP_SIZE];
        }
        tmp = warp_reduce_sum(tmp, item_ct1);
    }

    // mean(x^2) and the reciprocal root-mean-square (eps for stability).
    const float mean = tmp / ncols;
    const float scale = sycl::rsqrt(mean + eps);

    // Second strided pass: write the scaled row.
    for (int col = tid; col < ncols; col += block_size) {
        dst[row * ncols + col] = scale * x[row * ncols + col];
    }
}
181+
182+
static void norm_f32_sycl(const float* x, float* dst, const int ncols,
183+
const int nrows, const float eps,
184+
queue_ptr stream) {
185+
GGML_ASSERT(ncols % WARP_SIZE == 0);
186+
if (ncols < 1024) {
187+
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
188+
stream->submit([&](sycl::handler& cgh) {
189+
cgh.parallel_for(
190+
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
191+
block_dims),
192+
[=](sycl::nd_item<3> item_ct1)
193+
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
194+
norm_f32(x, dst, ncols, eps, item_ct1,
195+
nullptr, WARP_SIZE);
196+
});
197+
});
198+
}
199+
else {
200+
const int work_group_size = get_work_group_size(stream->get_device());
201+
const sycl::range<3> block_dims(1, 1, work_group_size);
202+
/*
203+
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
204+
the limit. To get the device limit, query
205+
info::device::max_work_group_size. Adjust the work-group size if needed.
206+
*/
207+
stream->submit([&](sycl::handler& cgh) {
208+
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
209+
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
210+
211+
cgh.parallel_for(
212+
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
213+
block_dims),
214+
[=](sycl::nd_item<3> item_ct1)
215+
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
216+
norm_f32(x, dst, ncols, eps, item_ct1,
217+
s_sum_acc_ct1.get_pointer(), work_group_size);
218+
});
219+
});
220+
}
221+
}
222+
223+
static void group_norm_f32_sycl(const float* x, float* dst,
224+
const int num_groups, const int group_size,
225+
const int ne_elements, queue_ptr stream) {
226+
static const float eps = 1e-6f;
227+
if (group_size < 1024) {
228+
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
229+
stream->submit([&](sycl::handler& cgh) {
230+
const float eps_ct4 = eps;
231+
cgh.parallel_for(
232+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
233+
block_dims),
234+
[=](sycl::nd_item<3> item_ct1)
235+
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
236+
group_norm_f32(
237+
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
238+
nullptr, WARP_SIZE);
239+
});
240+
});
241+
}
242+
else {
243+
const int work_group_size = get_work_group_size(stream->get_device());
244+
const sycl::range<3> block_dims(1, 1, work_group_size);
245+
/*
246+
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
247+
the limit. To get the device limit, query
248+
info::device::max_work_group_size. Adjust the work-group size if needed.
249+
*/
250+
251+
stream->submit([&](sycl::handler& cgh) {
252+
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
253+
cgh);
254+
255+
const float eps_ct4 = eps;
256+
257+
cgh.parallel_for(
258+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
259+
block_dims),
260+
[=](sycl::nd_item<3> item_ct1)
261+
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
262+
group_norm_f32(x, dst, group_size, ne_elements,
263+
eps_ct4, item_ct1,
264+
s_sum_acc_ct1.get_pointer(), work_group_size);
265+
});
266+
});
267+
}
268+
}
269+
270+
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
271+
const int nrows, const float eps,
272+
queue_ptr stream) {
273+
GGML_ASSERT(ncols % WARP_SIZE == 0);
274+
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
275+
if (ncols < 1024) {
276+
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
277+
stream->submit([&](sycl::handler& cgh) {
278+
cgh.parallel_for(
279+
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
280+
block_dims),
281+
[=](sycl::nd_item<3> item_ct1)
282+
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
283+
rms_norm_f32(x, dst, ncols, eps, item_ct1,
284+
nullptr, WARP_SIZE);
285+
});
286+
});
287+
}
288+
else {
289+
const int work_group_size = get_work_group_size(stream->get_device());
290+
const sycl::range<3> block_dims(1, 1, work_group_size);
291+
/*
292+
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
293+
the limit. To get the device limit, query
294+
info::device::max_work_group_size. Adjust the work-group size if needed.
295+
*/
296+
stream->submit([&](sycl::handler& cgh) {
297+
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
298+
cgh);
299+
cgh.parallel_for(
300+
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
301+
block_dims),
302+
[=](sycl::nd_item<3> item_ct1)
303+
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
304+
rms_norm_f32(x, dst, ncols, eps, item_ct1,
305+
s_sum_acc_ct1.get_pointer(), work_group_size);
306+
});
307+
});
308+
}
309+
}
310+
311+
// ggml backend entry point for GGML_OP_NORM (layer norm) on the SYCL device.
// src0_dd/dst_dd are the device buffers for src0/dst; eps comes from the op's
// parameters. src1/src1_dd are unused for this op.
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
    ggml_tensor* dst, const float* src0_dd,
    const float* src1_dd, float* dst_dd,
    const queue_ptr& main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    // eps is stored as raw float bits in dst->op_params.
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

    // Fix: also silence the unused `ctx` parameter, consistent with the
    // existing casts below.
    (void)ctx;
    (void)src1;
    (void)dst;
    (void)src1_dd;
}
331+
332+
// ggml backend entry point for GGML_OP_GROUP_NORM on the SYCL device.
// The number of groups comes from dst->op_params[0]; the group size is the
// per-group element count rounded up over ne[2]. src1/src1_dd are unused.
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
    const ggml_tensor* src1, ggml_tensor* dst,
    const float* src0_dd, const float* src1_dd,
    float* dst_dd,
    const queue_ptr& main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    int num_groups = dst->op_params[0];
    // ceil(ne[2] / num_groups) channels per group, each ne[0]*ne[1] elements.
    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);

    // Fix: also silence the unused `ctx` parameter, consistent with the
    // existing casts below.
    (void)ctx;
    (void)src1;
    (void)dst;
    (void)src1_dd;
}
349+
350+
// ggml backend entry point for GGML_OP_RMS_NORM on the SYCL device.
// src0_dd/dst_dd are the device buffers for src0/dst; eps comes from the op's
// parameters. src1/src1_dd are unused for this op.
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
    const ggml_tensor* src1, ggml_tensor* dst,
    const float* src0_dd, const float* src1_dd,
    float* dst_dd,
    const queue_ptr& main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    // eps is stored as raw float bits in dst->op_params.
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

    // Fix: also silence the unused `ctx` parameter, consistent with the
    // existing casts below.
    (void)ctx;
    (void)src1;
    (void)dst;
    (void)src1_dd;
}

0 commit comments

Comments
 (0)