Skip to content

Commit 2e93643

Browse files
CUDA: fix crash with partial offloading of MoE
1 parent 15e6125 commit 2e93643

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1909,13 +1909,18 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
19091909
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
19101910
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
19111911

1912+
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
1913+
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
1914+
// Therefore, in such cases use cuBLAS.
1915+
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src;
1916+
19121917
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
19131918
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
19141919
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
1915-
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
1920+
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
19161921
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
19171922
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
1918-
bool use_mul_mat_q = ggml_is_quantized(src0->type)
1923+
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
19191924
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
19201925

19211926
bool any_gpus_with_slow_fp16 = false;

0 commit comments

Comments
 (0)