Skip to content

Commit e151321

Browse files
CUDA: minimum size for split buffers
1 parent ec450d3 commit e151321

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

ggml/src/ggml-cuda.cu

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2978,6 +2978,22 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
29782978
{
29792979
struct ggml_tensor * a = op->src[0];
29802980
struct ggml_tensor * b = op->src[1];
2981+
// only use row split if the weight matrix is large enough for every GPU to get data (this solves some edge cases)
2982+
// also, for small matrices the overhead of splitting is very large anyway, so row split would be slow
2983+
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
2984+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
2985+
int64_t active_devices = 0;
2986+
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
2987+
int64_t row_low;
2988+
int64_t row_high;
2989+
get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, id);
2990+
active_devices += row_low == row_high;
2991+
}
2992+
const int64_t rounding = get_row_rounding(buft_ctx->tensor_split);
2993+
if (rounding*active_devices < a->ne[1]) {
2994+
return false;
2995+
}
2996+
}
29812997
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
29822998
return false;
29832999
}

0 commit comments

Comments
 (0)