@@ -3886,13 +3886,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 // rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                 const float p_delta, const int p_delta_rows, const float theta_scale) {
-    const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
         return;
     }

-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;

     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3941,8 +3941,8 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
 }

 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;

     if (col >= ncols) {
         return;
@@ -3958,9 +3958,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
 // values are also not normalized to the maximum value by subtracting it in the exponential function
 // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
 static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int block_size = blockDim.x;
-    const int tid = threadIdx.x;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;

     float tmp = 0.0;

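Taken together, the three kernel hunks above apply the same transposition: the row index moves from the y to the x grid dimension, and the column (or lane) index from x to y. The likely motivation is CUDA's asymmetric grid limits: gridDim.x allows up to 2^31-1 blocks while gridDim.y is capped at 65535, so the dimension that scales with tensor size (rows) belongs on x. A minimal sketch of the new convention follows; the kernel name and the copy body are made up for illustration, only the indexing pattern is from the patch:

// Sketch only: rows on the x grid axis, columns tiled on y.
// gridDim.x allows up to 2^31-1 blocks, gridDim.y only 65535,
// so the index that grows with nrows must live on x.
static __global__ void rows_on_x_f32(const float * src, float * dst, const int ncols) {
    const int row = blockDim.x*blockIdx.x + threadIdx.x; // launched with blockDim.x == 1
    const int col = blockDim.y*blockIdx.y + threadIdx.y; // column tile within the row

    if (col >= ncols) {
        return;
    }

    dst[row*ncols + col] = src[row*ncols + col]; // placeholder per-element work
}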
@@ -4752,9 +4752,9 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
-    const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }

@@ -4767,15 +4767,15 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
 }

 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, nrows_x, 1);
+    const dim3 block_nums(nrows_x, block_num_x, 1);
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }

 static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows_x, 1);
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }

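The host-side launchers are updated in lockstep: the x and y components of both block_dims and block_nums are swapped so the launch geometry keeps matching the kernels above, with nrows always on grid x. A hypothetical launcher for the sketch kernel from the earlier note; the 256-wide column tile is an assumption, not a constant from the patch:

// Hypothetical launcher following the same row-on-x convention.
static void rows_on_x_f32_cuda(const float * src, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(1, 256, 1);                 // one row per block on x, 256 column lanes on y
    const int num_blocks_y = (ncols + 256 - 1) / 256; // ceil(ncols / 256) column tiles
    const dim3 block_nums(nrows, num_blocks_y, 1);    // nrows on x avoids the 65535 grid-y ceiling
    rows_on_x_f32<<<block_nums, block_dims, 0, stream>>>(src, dst, ncols);
}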
@@ -6240,7 +6240,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -6249,14 +6249,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }

     tensor->backend = GGML_BACKEND_GPU;
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
     struct ggml_tensor_extra_gpu * extra;

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
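The new no_alloc flag lets a caller tag a tensor (and the CPU-side view/reshape sources it recurses into) as GPU-resident while deferring memory assignment: when scratch && no_alloc, the function returns right after setting tensor->backend, leaving tensor->extra untouched until an offset is bound later (see ggml_cuda_assign_scratch_offset in the next hunk). A small sketch of the resulting state, assuming a freshly created tensor and a nonzero g_scratch_size; the asserts are illustrative, not part of the patch:

// Sketch: state after a deferred (no_alloc) pass.
ggml_cuda_assign_buffers_no_alloc(tensor);   // wrapper added later in this patch
assert(tensor->backend == GGML_BACKEND_GPU); // backend is set immediately
assert(tensor->extra == nullptr);            // device address deferred (fresh tensor assumed)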
@@ -6308,16 +6313,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     tensor->extra = extra;
 }

+void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+    }
+
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false);
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
 }

 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
 }

 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true);
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }

 void ggml_cuda_set_main_device(int main_device) {
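ggml_cuda_assign_scratch_offset is the second half of the deferred path: it lazily allocates the global scratch buffer on first use, then points the tensor either into its source's device buffer (for in-place ops and views, honoring the byte offset stored in op_params) or at g_scratch_buffer + offset. A rough sketch of the intended measure-then-bind flow; the helper name and the offsets are invented for illustration, and a real caller would compute offsets with an external planner from tensor sizes and lifetimes:

// Hypothetical two-phase placement built on the new entry points.
static void offload_with_planned_scratch(struct ggml_tensor * a, struct ggml_tensor * b) {
    ggml_cuda_assign_buffers_no_alloc(a);     // pass 1: mark GPU, no memory bound yet
    ggml_cuda_assign_buffers_no_alloc(b);

    // pass 2: bind scratch addresses once offsets have been chosen
    ggml_cuda_assign_scratch_offset(a, 0);    // offsets here are made up; a real
    ggml_cuda_assign_scratch_offset(b, 4096); // planner derives them from the graph
}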