@@ -4908,16 +4908,17 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
4908
4908
4909
4909
static void rope_f32_cuda (const float * x, float * dst, const int ncols, const int nrows, const float p0,
4910
4910
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4911
- GGML_ASSERT (nrows % 2 == 0 ); // GG: is this assert really needed? I don't see why
4912
- const dim3 block_dims (1 , 2 * CUDA_ROPE_BLOCK_SIZE, 1 );
4911
+ GGML_ASSERT (ncols % 2 == 0 );
4912
+ const dim3 block_dims (1 , CUDA_ROPE_BLOCK_SIZE, 1 );
4913
4913
const int num_blocks_x = (ncols + 2 *CUDA_ROPE_BLOCK_SIZE - 1 ) / (2 *CUDA_ROPE_BLOCK_SIZE);
4914
4914
const dim3 block_nums (nrows, num_blocks_x, 1 );
4915
4915
rope_f32<<<block_nums, block_dims, 0 , stream>>> (x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4916
4916
}
4917
4917
4918
4918
static void rope_neox_f32_cuda (const float * x, float * dst, const int ncols, const int nrows, const float p0,
4919
4919
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4920
- const dim3 block_dims (1 , 2 *CUDA_ROPE_BLOCK_SIZE, 1 );
4920
+ GGML_ASSERT (ncols % 2 == 0 );
4921
+ const dim3 block_dims (1 , CUDA_ROPE_BLOCK_SIZE, 1 );
4921
4922
const int num_blocks_x = (ncols + 2 *CUDA_ROPE_BLOCK_SIZE - 1 ) / (2 *CUDA_ROPE_BLOCK_SIZE);
4922
4923
const dim3 block_nums (nrows, num_blocks_x, 1 );
4923
4924
rope_neox_f32<<<block_nums, block_dims, 0 , stream>>> (x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
0 commit comments