Skip to content

Commit 9b81b57

Browse files
committed
[SYCL] unify rope norm/neox
As per: #7634
Signed-off-by: Joe Todd <[email protected]>
1 parent a9cae48 commit 9b81b57

File tree

1 file changed

+68
-100
lines changed

1 file changed

+68
-100
lines changed

ggml-sycl.cpp

Lines changed: 68 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -8826,7 +8826,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
88268826
}
88278827

88288828
struct rope_corr_dims {
8829-
float v[4];
8829+
float v[2];
88308830
};
88318831

88328832
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
@@ -8850,29 +8850,38 @@ static void rope_yarn(
88508850
}
88518851

88528852
// rope == RoPE == rotary positional embedding
8853-
template<typename T, bool has_pos>
8854-
static void rope(
8855-
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
8856-
float ext_factor, float attn_factor, rope_corr_dims corr_dims
8857-
,
8853+
template<typename T, bool has_ff>
8854+
static void rope_norm(
8855+
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
8856+
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
88588857
const sycl::nd_item<3> &item_ct1) {
8859-
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
8858+
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
88608859
item_ct1.get_local_id(1));
88618860

8862-
if (col >= ncols) {
8861+
if (i0 >= ne0) {
88638862
return;
88648863
}
88658864

88668865
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
88678866
item_ct1.get_local_id(2);
8868-
const int i = row*ncols + col;
8867+
8868+
if (i0 >= n_dims) {
8869+
const int i = row*ne0 + i0;
8870+
8871+
dst[i + 0] = x[i + 0];
8872+
dst[i + 1] = x[i + 1];
8873+
8874+
return;
8875+
}
8876+
8877+
const int i = row*ne0 + i0;
88698878
const int i2 = row/p_delta_rows;
88708879

8871-
const int p = has_pos ? pos[i2] : 0;
8872-
const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
8880+
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
8881+
const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
88738882

88748883
float cos_theta, sin_theta;
8875-
rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
8884+
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
88768885

88778886
const float x0 = x[i + 0];
88788887
const float x1 = x[i + 1];
@@ -8881,45 +8890,40 @@ static void rope(
88818890
dst[i + 1] = x0*sin_theta + x1*cos_theta;
88828891
}
88838892

8884-
template<typename T, bool has_pos, bool has_freq_facs>
8885-
static void rope_neox(
8886-
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
8887-
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
8888-
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
8889-
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
8893+
template <typename T, bool has_ff>
8894+
static void rope_neox(const T *x, T *dst, int ne0, int n_dims,
8895+
const int32_t *pos, float freq_scale, int p_delta_rows,
8896+
float ext_factor, float attn_factor,
8897+
rope_corr_dims corr_dims, float theta_scale,
8898+
const float *freq_factors,
8899+
const sycl::nd_item<3> &item_ct1) {
8900+
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
88908901
item_ct1.get_local_id(1));
88918902

8892-
if (col >= ncols) {
8903+
if (i0 >= ne0) {
88938904
return;
88948905
}
88958906

88968907
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
88978908
item_ct1.get_local_id(2);
8898-
const int ib = col / n_dims;
8899-
const int ic = col % n_dims;
89008909

8901-
if (ib > 0) {
8902-
const int i = row*ncols + ib*n_dims + ic;
8910+
if (i0 >= n_dims) {
8911+
const int i = row*ne0 + i0;
89038912

89048913
dst[i + 0] = x[i + 0];
89058914
dst[i + 1] = x[i + 1];
89068915

89078916
return;
89088917
}
89098918

8910-
const int i = row*ncols + ib*n_dims + ic/2;
8919+
const int i = row*ne0 + i0/2;
89118920
const int i2 = row/p_delta_rows;
89128921

8913-
float cur_rot = inv_ndims * ic - ib;
8914-
8915-
const int p = has_pos ? pos[i2] : 0;
8916-
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
8917-
8918-
const float theta_base =
8919-
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
8922+
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
8923+
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
89208924

89218925
float cos_theta, sin_theta;
8922-
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
8926+
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
89238927

89248928
const float x0 = x[i + 0];
89258929
const float x1 = x[i + n_dims/2];
@@ -12375,15 +12379,18 @@ static void clamp_f32_sycl(const float *x, float *dst, const float min,
1237512379
}
1237612380

1237712381
template <typename T>
12378-
static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
12382+
static void rope_norm_sycl(const T *x, T *dst, int ne0, int n_dims, int nr,
1237912383
const int32_t *pos, float freq_scale, int p_delta_rows,
1238012384
float freq_base, float ext_factor, float attn_factor,
12381-
rope_corr_dims corr_dims, dpct::queue_ptr stream) {
12382-
GGML_ASSERT(ncols % 2 == 0);
12385+
rope_corr_dims corr_dims, const float * freq_factors, dpct::queue_ptr stream) {
12386+
GGML_ASSERT(ne0 % 2 == 0);
1238312387
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
12384-
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12385-
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
12386-
if (pos == nullptr) {
12388+
const int n_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12389+
const sycl::range<3> block_nums(1, n_blocks_x, nr);
12390+
12391+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
12392+
12393+
if (freq_factors == nullptr) {
1238712394
/*
1238812395
DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
1238912396
the limit. To get the device limit, query
@@ -12395,8 +12402,8 @@ static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
1239512402
stream->parallel_for(
1239612403
sycl::nd_range<3>(block_nums * block_dims, block_dims),
1239712404
[=](sycl::nd_item<3> item_ct1) {
12398-
rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
12399-
freq_base, ext_factor, attn_factor, corr_dims,
12405+
rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
12406+
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
1240012407
item_ct1);
1240112408
});
1240212409
} else {
@@ -12411,70 +12418,46 @@ static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
1241112418
stream->parallel_for(
1241212419
sycl::nd_range<3>(block_nums * block_dims, block_dims),
1241312420
[=](sycl::nd_item<3> item_ct1) {
12414-
rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
12415-
freq_base, ext_factor, attn_factor, corr_dims,
12421+
rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
12422+
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
1241612423
item_ct1);
1241712424
});
1241812425
}
1241912426
}
1242012427

1242112428
template <typename T>
12422-
static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
12429+
static void rope_neox_sycl(const T *x, T *dst, int ne0, int n_dims, int nr,
1242312430
const int32_t *pos, float freq_scale,
1242412431
int p_delta_rows, float freq_base, float ext_factor,
1242512432
float attn_factor, rope_corr_dims corr_dims,
1242612433
const float * freq_factors, dpct::queue_ptr stream) {
12427-
GGML_ASSERT(ncols % 2 == 0);
12434+
GGML_ASSERT(ne0 % 2 == 0);
1242812435
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
12429-
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12430-
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
12436+
const int n_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12437+
const sycl::range<3> block_nums(1, n_blocks_x, nr);
1243112438

1243212439
const float theta_scale = powf(freq_base, -2.0f/n_dims);
12433-
const float inv_ndims = -1.0f / n_dims;
1243412440

12435-
if (pos == nullptr) {
1243612441
dpct::has_capability_or_fail(stream->get_device(),
1243712442
{sycl::aspect::fp16});
1243812443
if (freq_factors == nullptr) {
1243912444
stream->parallel_for(
1244012445
sycl::nd_range<3>(block_nums * block_dims, block_dims),
1244112446
[=](sycl::nd_item<3> item_ct1) {
12442-
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
12447+
rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale,
1244312448
p_delta_rows, ext_factor, attn_factor,
12444-
corr_dims, theta_scale, inv_ndims, freq_factors,
12449+
corr_dims, theta_scale, freq_factors,
1244512450
item_ct1);
1244612451
});
1244712452
} else {
1244812453
stream->parallel_for(
1244912454
sycl::nd_range<3>(block_nums * block_dims, block_dims),
1245012455
[=](sycl::nd_item<3> item_ct1) {
12451-
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
12456+
rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale,
1245212457
p_delta_rows, ext_factor, attn_factor,
12453-
corr_dims, theta_scale, inv_ndims, freq_factors,
12458+
corr_dims, theta_scale, freq_factors,
1245412459
item_ct1);
1245512460
});
12456-
}
12457-
} else {
12458-
dpct::has_capability_or_fail(stream->get_device(),
12459-
{sycl::aspect::fp16});
12460-
12461-
if (freq_factors == nullptr) {
12462-
stream->parallel_for(
12463-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12464-
[=](sycl::nd_item<3> item_ct1) {
12465-
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
12466-
p_delta_rows, ext_factor, attn_factor,
12467-
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
12468-
});
12469-
} else {
12470-
stream->parallel_for(
12471-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12472-
[=](sycl::nd_item<3> item_ct1) {
12473-
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
12474-
p_delta_rows, ext_factor, attn_factor,
12475-
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
12476-
});
12477-
}
1247812461
}
1247912462
}
1248012463

@@ -14005,8 +13988,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
1400513988

1400613989
const int64_t ne00 = src0->ne[0];
1400713990
const int64_t ne01 = src0->ne[1];
14008-
const int64_t ne2 = dst->ne[2];
14009-
const int64_t nrows = ggml_nrows(src0);
13991+
const int64_t nr = ggml_nrows(src0);
1401013992

1401113993
//const int n_past = ((int32_t *) dst->op_params)[0];
1401213994
const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -14023,27 +14005,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
1402314005
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1402414006
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1402514007

14026-
const float * freq_factors = nullptr;
14027-
const int32_t * pos = nullptr;
14028-
if ((mode & 1) == 0) {
14029-
GGML_ASSERT(src1->type == GGML_TYPE_I32);
14030-
GGML_ASSERT(src1->ne[0] == ne2);
14031-
pos = (const int32_t *) src1_dd;
14032-
}
14033-
1403414008
const bool is_neox = mode & 2;
1403514009

14036-
#pragma message("TODO: update rope NORM mode to match NEOX mode")
14037-
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
14038-
14039-
if (is_neox) {
14040-
pos = (const int32_t *) src1_dd;
14010+
const int32_t * pos = (const int32_t *) src1_dd;
1404114011

14012+
const float * freq_factors = nullptr;
1404214013
if (src2 != nullptr) {
1404314014
freq_factors = (const float *) src2->data;
14044-
}
14045-
} else {
14046-
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
1404714015
}
1404814016

1404914017
rope_corr_dims corr_dims;
@@ -14053,27 +14021,27 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
1405314021
if (is_neox) {
1405414022
if (src0->type == GGML_TYPE_F32) {
1405514023
rope_neox_sycl(
14056-
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
14024+
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
1405714025
attn_factor, corr_dims, freq_factors, main_stream
1405814026
);
1405914027
} else if (src0->type == GGML_TYPE_F16) {
1406014028
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
14061-
ne00, n_dims, nrows, pos, freq_scale, ne01,
14029+
ne00, n_dims, nr, pos, freq_scale, ne01,
1406214030
freq_base, ext_factor, attn_factor, corr_dims,
1406314031
freq_factors, main_stream);
1406414032
} else {
1406514033
GGML_ASSERT(false);
1406614034
}
1406714035
} else {
1406814036
if (src0->type == GGML_TYPE_F32) {
14069-
rope_sycl(
14070-
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
14071-
attn_factor, corr_dims, main_stream
14037+
rope_norm_sycl(
14038+
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
14039+
attn_factor, corr_dims, freq_factors, main_stream
1407214040
);
1407314041
} else if (src0->type == GGML_TYPE_F16) {
14074-
rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
14075-
nrows, pos, freq_scale, ne01, freq_base, ext_factor,
14076-
attn_factor, corr_dims, main_stream);
14042+
rope_norm_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
14043+
n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
14044+
attn_factor, corr_dims, freq_factors, main_stream);
1407714045
} else {
1407814046
GGML_ASSERT(false);
1407914047
}

0 commit comments

Comments
 (0)