Skip to content

Commit b2c8c83

Browse files
authored
Merge pull request #7919 from ggerganov/codeplay/unify-rope-sycl
sycl-exp : unify rope neox/norm
2 parents a9cae48 + ded54b5 commit b2c8c83

File tree

1 file changed

+94 / -122 lines changed (94 additions, 122 deletions)

1 file changed

+94 / -122 lines changed (94 additions, 122 deletions)

ggml-sycl.cpp

Lines changed: 94 additions & 122 deletions
Original file line number | Diff line number | Diff line change
@@ -8826,7 +8826,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
88268826
}
88278827

88288828
struct rope_corr_dims {
8829-
float v[4];
8829+
float v[2];
88308830
};
88318831

88328832
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
@@ -8850,29 +8850,38 @@ static void rope_yarn(
88508850
}
88518851

88528852
// rope == RoPE == rotary positional embedding
8853-
template<typename T, bool has_pos>
8854-
static void rope(
8855-
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
8856-
float ext_factor, float attn_factor, rope_corr_dims corr_dims
8857-
,
8853+
template<typename T, bool has_ff>
8854+
static void rope_norm(
8855+
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
8856+
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
88588857
const sycl::nd_item<3> &item_ct1) {
8859-
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
8858+
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
88608859
item_ct1.get_local_id(1));
88618860

8862-
if (col >= ncols) {
8861+
if (i0 >= ne0) {
88638862
return;
88648863
}
88658864

88668865
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
88678866
item_ct1.get_local_id(2);
8868-
const int i = row*ncols + col;
8867+
8868+
if (i0 >= n_dims) {
8869+
const int i = row*ne0 + i0;
8870+
8871+
dst[i + 0] = x[i + 0];
8872+
dst[i + 1] = x[i + 1];
8873+
8874+
return;
8875+
}
8876+
8877+
const int i = row*ne0 + i0;
88698878
const int i2 = row/p_delta_rows;
88708879

8871-
const int p = has_pos ? pos[i2] : 0;
8872-
const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
8880+
const float theta_base = pos[i2]*sycl::pow(theta_scale, i0/2.0f);
8881+
const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
88738882

88748883
float cos_theta, sin_theta;
8875-
rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
8884+
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
88768885

88778886
const float x0 = x[i + 0];
88788887
const float x1 = x[i + 1];
@@ -8881,45 +8890,40 @@ static void rope(
88818890
dst[i + 1] = x0*sin_theta + x1*cos_theta;
88828891
}
88838892

8884-
template<typename T, bool has_pos, bool has_freq_facs>
8885-
static void rope_neox(
8886-
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
8887-
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
8888-
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
8889-
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
8893+
template <typename T, bool has_ff>
8894+
static void rope_neox(const T *x, T *dst, int ne0, int n_dims,
8895+
const int32_t *pos, float freq_scale, int p_delta_rows,
8896+
float ext_factor, float attn_factor,
8897+
rope_corr_dims corr_dims, float theta_scale,
8898+
const float *freq_factors,
8899+
const sycl::nd_item<3> &item_ct1) {
8900+
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
88908901
item_ct1.get_local_id(1));
88918902

8892-
if (col >= ncols) {
8903+
if (i0 >= ne0) {
88938904
return;
88948905
}
88958906

88968907
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
88978908
item_ct1.get_local_id(2);
8898-
const int ib = col / n_dims;
8899-
const int ic = col % n_dims;
89008909

8901-
if (ib > 0) {
8902-
const int i = row*ncols + ib*n_dims + ic;
8910+
if (i0 >= n_dims) {
8911+
const int i = row*ne0 + i0;
89038912

89048913
dst[i + 0] = x[i + 0];
89058914
dst[i + 1] = x[i + 1];
89068915

89078916
return;
89088917
}
89098918

8910-
const int i = row*ncols + ib*n_dims + ic/2;
8919+
const int i = row*ne0 + i0/2;
89118920
const int i2 = row/p_delta_rows;
89128921

8913-
float cur_rot = inv_ndims * ic - ib;
8914-
8915-
const int p = has_pos ? pos[i2] : 0;
8916-
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
8917-
8918-
const float theta_base =
8919-
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
8922+
const float theta_base = pos[i2]*sycl::pow(theta_scale, i0/2.0f);
8923+
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
89208924

89218925
float cos_theta, sin_theta;
8922-
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
8926+
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
89238927

89248928
const float x0 = x[i + 0];
89258929
const float x1 = x[i + n_dims/2];
@@ -12375,15 +12379,18 @@ static void clamp_f32_sycl(const float *x, float *dst, const float min,
1237512379
}
1237612380

1237712381
template <typename T>
12378-
static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
12382+
static void rope_norm_sycl(const T *x, T *dst, int ne0, int n_dims, int nr,
1237912383
const int32_t *pos, float freq_scale, int p_delta_rows,
1238012384
float freq_base, float ext_factor, float attn_factor,
12381-
rope_corr_dims corr_dims, dpct::queue_ptr stream) {
12382-
GGML_ASSERT(ncols % 2 == 0);
12385+
rope_corr_dims corr_dims, const float * freq_factors, dpct::queue_ptr stream) {
12386+
GGML_ASSERT(ne0 % 2 == 0);
1238312387
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
12384-
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12385-
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
12386-
if (pos == nullptr) {
12388+
const int n_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12389+
const sycl::range<3> block_nums(1, n_blocks_x, nr);
12390+
12391+
const float theta_scale = sycl::pow(freq_base, -2.0f/n_dims);
12392+
12393+
if (freq_factors == nullptr) {
1238712394
/*
1238812395
DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
1238912396
the limit. To get the device limit, query
@@ -12395,8 +12402,8 @@ static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
1239512402
stream->parallel_for(
1239612403
sycl::nd_range<3>(block_nums * block_dims, block_dims),
1239712404
[=](sycl::nd_item<3> item_ct1) {
12398-
rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
12399-
freq_base, ext_factor, attn_factor, corr_dims,
12405+
rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
12406+
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
1240012407
item_ct1);
1240112408
});
1240212409
} else {
@@ -12411,70 +12418,46 @@ static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
1241112418
stream->parallel_for(
1241212419
sycl::nd_range<3>(block_nums * block_dims, block_dims),
1241312420
[=](sycl::nd_item<3> item_ct1) {
12414-
rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
12415-
freq_base, ext_factor, attn_factor, corr_dims,
12421+
rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
12422+
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
1241612423
item_ct1);
1241712424
});
1241812425
}
1241912426
}
1242012427

1242112428
template <typename T>
12422-
static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
12429+
static void rope_neox_sycl(const T *x, T *dst, int ne0, int n_dims, int nr,
1242312430
const int32_t *pos, float freq_scale,
1242412431
int p_delta_rows, float freq_base, float ext_factor,
1242512432
float attn_factor, rope_corr_dims corr_dims,
1242612433
const float * freq_factors, dpct::queue_ptr stream) {
12427-
GGML_ASSERT(ncols % 2 == 0);
12434+
GGML_ASSERT(ne0 % 2 == 0);
1242812435
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
12429-
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12430-
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
12436+
const int n_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
12437+
const sycl::range<3> block_nums(1, n_blocks_x, nr);
1243112438

12432-
const float theta_scale = powf(freq_base, -2.0f/n_dims);
12433-
const float inv_ndims = -1.0f / n_dims;
12439+
const float theta_scale = sycl::pow(freq_base, -2.0f/n_dims);
1243412440

12435-
if (pos == nullptr) {
12436-
dpct::has_capability_or_fail(stream->get_device(),
12437-
{sycl::aspect::fp16});
12438-
if (freq_factors == nullptr) {
12439-
stream->parallel_for(
12440-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12441-
[=](sycl::nd_item<3> item_ct1) {
12442-
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
12443-
p_delta_rows, ext_factor, attn_factor,
12444-
corr_dims, theta_scale, inv_ndims, freq_factors,
12445-
item_ct1);
12446-
});
12447-
} else {
12448-
stream->parallel_for(
12449-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12450-
[=](sycl::nd_item<3> item_ct1) {
12451-
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
12452-
p_delta_rows, ext_factor, attn_factor,
12453-
corr_dims, theta_scale, inv_ndims, freq_factors,
12454-
item_ct1);
12455-
});
12456-
}
12441+
dpct::has_capability_or_fail(stream->get_device(),
12442+
{sycl::aspect::fp16});
12443+
if (freq_factors == nullptr) {
12444+
stream->parallel_for(
12445+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12446+
[=](sycl::nd_item<3> item_ct1) {
12447+
rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale,
12448+
p_delta_rows, ext_factor, attn_factor,
12449+
corr_dims, theta_scale, freq_factors,
12450+
item_ct1);
12451+
});
1245712452
} else {
12458-
dpct::has_capability_or_fail(stream->get_device(),
12459-
{sycl::aspect::fp16});
12460-
12461-
if (freq_factors == nullptr) {
12462-
stream->parallel_for(
12463-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12464-
[=](sycl::nd_item<3> item_ct1) {
12465-
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
12466-
p_delta_rows, ext_factor, attn_factor,
12467-
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
12468-
});
12469-
} else {
12470-
stream->parallel_for(
12471-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12472-
[=](sycl::nd_item<3> item_ct1) {
12473-
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
12474-
p_delta_rows, ext_factor, attn_factor,
12475-
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
12476-
});
12477-
}
12453+
stream->parallel_for(
12454+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
12455+
[=](sycl::nd_item<3> item_ct1) {
12456+
rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale,
12457+
p_delta_rows, ext_factor, attn_factor,
12458+
corr_dims, theta_scale, freq_factors,
12459+
item_ct1);
12460+
});
1247812461
}
1247912462
}
1248012463

@@ -12592,8 +12575,8 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
1259212575
const uint32_t n_head_kv = nrows_x/nrows_y;
1259312576
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
1259412577

12595-
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
12596-
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
12578+
const float m0 = sycl::pow(2.0f, -(max_bias ) / n_head_log2);
12579+
const float m1 = sycl::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
1259712580

1259812581
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
1259912582
if (n_local_scratch*sizeof(float) < local_mem_size) {
@@ -14005,8 +13988,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
1400513988

1400613989
const int64_t ne00 = src0->ne[0];
1400713990
const int64_t ne01 = src0->ne[1];
14008-
const int64_t ne2 = dst->ne[2];
14009-
const int64_t nrows = ggml_nrows(src0);
13991+
const int64_t nr = ggml_nrows(src0);
1401013992

1401113993
//const int n_past = ((int32_t *) dst->op_params)[0];
1401213994
const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -14023,27 +14005,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
1402314005
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1402414006
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1402514007

14026-
const float * freq_factors = nullptr;
14027-
const int32_t * pos = nullptr;
14028-
if ((mode & 1) == 0) {
14029-
GGML_ASSERT(src1->type == GGML_TYPE_I32);
14030-
GGML_ASSERT(src1->ne[0] == ne2);
14031-
pos = (const int32_t *) src1_dd;
14032-
}
14033-
1403414008
const bool is_neox = mode & 2;
1403514009

14036-
#pragma message("TODO: update rope NORM mode to match NEOX mode")
14037-
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
14038-
14039-
if (is_neox) {
14040-
pos = (const int32_t *) src1_dd;
14010+
const int32_t * pos = (const int32_t *) src1_dd;
1404114011

14042-
if (src2 != nullptr) {
14043-
freq_factors = (const float *) src2->data;
14044-
}
14045-
} else {
14046-
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
14012+
const float * freq_factors = nullptr;
14013+
if (src2 != nullptr) {
14014+
freq_factors = (const float *) src2->data;
1404714015
}
1404814016

1404914017
rope_corr_dims corr_dims;
@@ -14053,27 +14021,27 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
1405314021
if (is_neox) {
1405414022
if (src0->type == GGML_TYPE_F32) {
1405514023
rope_neox_sycl(
14056-
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
14024+
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
1405714025
attn_factor, corr_dims, freq_factors, main_stream
1405814026
);
1405914027
} else if (src0->type == GGML_TYPE_F16) {
1406014028
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
14061-
ne00, n_dims, nrows, pos, freq_scale, ne01,
14029+
ne00, n_dims, nr, pos, freq_scale, ne01,
1406214030
freq_base, ext_factor, attn_factor, corr_dims,
1406314031
freq_factors, main_stream);
1406414032
} else {
1406514033
GGML_ASSERT(false);
1406614034
}
1406714035
} else {
1406814036
if (src0->type == GGML_TYPE_F32) {
14069-
rope_sycl(
14070-
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
14071-
attn_factor, corr_dims, main_stream
14037+
rope_norm_sycl(
14038+
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
14039+
attn_factor, corr_dims, freq_factors, main_stream
1407214040
);
1407314041
} else if (src0->type == GGML_TYPE_F16) {
14074-
rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
14075-
nrows, pos, freq_scale, ne01, freq_base, ext_factor,
14076-
attn_factor, corr_dims, main_stream);
14042+
rope_norm_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
14043+
n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
14044+
attn_factor, corr_dims, freq_factors, main_stream);
1407714045
} else {
1407814046
GGML_ASSERT(false);
1407914047
}
@@ -17267,7 +17235,12 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
1726717235
case GGML_OP_CONCAT:
1726817236
{
1726917237
ggml_type src0_type = op->src[0]->type;
17270-
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
17238+
int dim = op->op_params[0];
17239+
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16 && dim == 2;
17240+
} break;
17241+
case GGML_OP_ROPE:
17242+
{
17243+
return ggml_is_contiguous(op->src[0]);
1727117244
} break;
1727217245
case GGML_OP_DUP:
1727317246
case GGML_OP_NONE:
@@ -17287,7 +17260,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
1728717260
case GGML_OP_CONT:
1728817261
case GGML_OP_DIAG_MASK_INF:
1728917262
case GGML_OP_SOFT_MAX:
17290-
case GGML_OP_ROPE:
1729117263
case GGML_OP_IM2COL:
1729217264
case GGML_OP_POOL_2D:
1729317265
case GGML_OP_SUM_ROWS:

0 commit comments

Comments
 (0)