Skip to content

Commit ae3b1ab

Browse files
fix low compute capability prompt processing
1 parent 82b34d8 commit ae3b1ab

File tree

1 file changed

+14
-11
lines changed

1 file changed

+14
-11
lines changed

ggml-cuda.cu

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -6539,26 +6539,29 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
65396539
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
65406540
const bool src0_is_quantized = ggml_is_quantized(src0->type);
65416541

6542-
if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6542+
int min_compute_capability = INT_MAX;
6543+
for (int id = 0; id < g_device_count; ++id) {
6544+
if (min_compute_capability > g_compute_capabilities[id]
6545+
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6546+
min_compute_capability = g_compute_capabilities[id];
6547+
}
6548+
}
6549+
6550+
// no quantized non-contiguous support for lower CC kernels implemented
6551+
const bool nc_okay = src0->type == GGML_TYPE_F16 || g_compute_capabilities[g_main_device] >= MIN_CC_DP4A;
6552+
6553+
if (all_on_device && nc_okay && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
65436554
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6544-
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6555+
} else if (all_on_device && nc_okay && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
65456556
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
65466557
}else if (src0->type == GGML_TYPE_F32) {
65476558
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
65486559
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
65496560
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
65506561
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec<true>, false, false);
65516562
} else {
6552-
int min_compute_capability = INT_MAX;
6553-
for (int id = 0; id < g_device_count; ++id) {
6554-
if (min_compute_capability > g_compute_capabilities[id]
6555-
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6556-
min_compute_capability = g_compute_capabilities[id];
6557-
}
6558-
}
6559-
65606563
if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
6561-
if (all_on_device && src0->backend != GGML_BACKEND_GPU_SPLIT) {
6564+
if (all_on_device && nc_okay && src0->backend != GGML_BACKEND_GPU_SPLIT) {
65626565
ggml_cuda_mul_mat_nc(src0, src1, dst);
65636566
} else {
65646567
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q<true>, false, false);

0 commit comments

Comments
 (0)