@@ -4774,8 +4774,8 @@ static __global__ void rope(
4774
4774
4775
4775
template <typename T, bool has_pos>
4776
4776
static __global__ void rope_neox (
4777
- const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base ,
4778
- float ext_factor, float attn_factor, rope_corr_dims corr_dims
4777
+ const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
4778
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
4779
4779
) {
4780
4780
const int col = 2 *(blockDim .y *blockIdx .y + threadIdx .y );
4781
4781
@@ -4784,23 +4784,25 @@ static __global__ void rope_neox(
4784
4784
}
4785
4785
4786
4786
const int row = blockDim .x *blockIdx .x + threadIdx .x ;
4787
- const int i = row*ncols + col/2 ;
4787
+ const int ib = col / n_dims;
4788
+ const int ic = col % n_dims;
4789
+
4790
+ const int i = row*ncols + ib*n_dims + ic/2 ;
4788
4791
const int i2 = row/p_delta_rows;
4789
4792
4790
- // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
4791
- const float cur_rot = -float (col)/ncols;
4793
+ float cur_rot = inv_ndims * ic - ib;
4792
4794
4793
4795
const int p = has_pos ? pos[i2] : 0 ;
4794
- const float theta_base = p*powf (freq_base, cur_rot );
4796
+ const float theta_base = p*freq_scale* powf (theta_scale, col/ 2 . 0f );
4795
4797
4796
4798
float cos_theta, sin_theta;
4797
4799
rope_yarn (theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
4798
4800
4799
4801
const float x0 = x[i + 0 ];
4800
- const float x1 = x[i + ncols /2 ];
4802
+ const float x1 = x[i + n_dims /2 ];
4801
4803
4802
- dst[i + 0 ] = x0*cos_theta - x1*sin_theta;
4803
- dst[i + ncols /2 ] = x0*sin_theta + x1*cos_theta;
4804
+ dst[i + 0 ] = x0*cos_theta - x1*sin_theta;
4805
+ dst[i + n_dims /2 ] = x0*sin_theta + x1*cos_theta;
4804
4806
}
4805
4807
4806
4808
static __global__ void rope_glm_f32 (
@@ -6085,20 +6087,26 @@ static void rope_cuda(
6085
6087
6086
6088
template <typename T>
6087
6089
static void rope_neox_cuda (
6088
- const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
6090
+ const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
6089
6091
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
6090
6092
) {
6091
6093
GGML_ASSERT (ncols % 2 == 0 );
6092
6094
const dim3 block_dims (1 , CUDA_ROPE_BLOCK_SIZE, 1 );
6093
6095
const int num_blocks_x = (ncols + 2 *CUDA_ROPE_BLOCK_SIZE - 1 ) / (2 *CUDA_ROPE_BLOCK_SIZE);
6094
6096
const dim3 block_nums (nrows, num_blocks_x, 1 );
6097
+
6098
+ const float theta_scale = powf (freq_base, -2 .0f /n_dims);
6099
+ const float inv_ndims = -1 .0f / n_dims;
6100
+
6095
6101
if (pos == nullptr ) {
6096
6102
rope_neox<T, false ><<<block_nums, block_dims, 0 , stream>>> (
6097
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
6103
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
6104
+ theta_scale, inv_ndims
6098
6105
);
6099
6106
} else {
6100
6107
rope_neox<T, true ><<<block_nums, block_dims, 0 , stream>>> (
6101
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
6108
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
6109
+ theta_scale, inv_ndims
6102
6110
);
6103
6111
}
6104
6112
}
@@ -7039,12 +7047,12 @@ inline void ggml_cuda_op_rope(
7039
7047
GGML_ASSERT (ne00 == n_dims && " ne00 != n_dims is not implemented for CUDA yet" );
7040
7048
if (src0->type == GGML_TYPE_F32) {
7041
7049
rope_neox_cuda (
7042
- (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
7050
+ (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
7043
7051
attn_factor, corr_dims, main_stream
7044
7052
);
7045
7053
} else if (src0->type == GGML_TYPE_F16) {
7046
7054
rope_neox_cuda (
7047
- (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
7055
+ (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
7048
7056
attn_factor, corr_dims, main_stream
7049
7057
);
7050
7058
} else {
0 commit comments