Commit 15b1937

Offloading tensors based on total VRAM budget and offloading policy (ggml-org#6)
* deprecate ffn_b
* get tensor offloading levels
* wip: split tensor loading
* wip: framework of loading sparse model tensors
* save and flush gpu alloc buffer
* vram budget will fall back to remaining free memory
* minor: remove vram safety margin
* add options for vram budget; clean old env vars
* minor: bugfix
1 parent b89a0b7 commit 15b1937
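The fallback behaviour named in the commit message ("vram budget will fall back to remaining free memory") can be pictured roughly as follows. This is a minimal sketch, not code from the commit: resolve_vram_budget is a hypothetical helper built only on the ggml_cuda_get_free_memory function and the -1 default of --vram-budget that this commit introduces.

#include <cstddef>
#include "ggml-cuda.h"

// Hypothetical helper (not part of the commit): convert the user-facing GiB
// budget into a byte budget for the main GPU. A negative budget means
// "use whatever VRAM is currently free", matching the -1 default.
static size_t resolve_vram_budget(float vram_budget_gb, int main_gpu) {
    if (vram_budget_gb < 0.0f) {
        return ggml_cuda_get_free_memory(main_gpu);
    }
    return (size_t)(vram_budget_gb * 1024.0 * 1024.0 * 1024.0);
}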

File tree: 6 files changed (+417 −119 lines)

common/common.cpp

Lines changed: 13 additions & 0 deletions
@@ -565,6 +565,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 #else
         fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
+    } else if (arg == "--vram-budget") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+#ifdef GGML_USE_CUBLAS
+        params.vram_budget_gb = std::stof(argv[i]);
+#else
+        fprintf(stderr, "warning: PowerInfer was compiled without cuBLAS. It is not possible to set a VRAM budget.\n");
+#endif
     } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
 #ifdef GGML_USE_CUBLAS
         params.mul_mat_q = false;
@@ -801,6 +811,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
     printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
+    printf("  --vram-budget N       VRAM budget in GiB (default: -1, -1 = available VRAM)\n");
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -895,6 +906,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
     mparams.main_gpu       = params.main_gpu;
+    mparams.vram_budget_gb = params.vram_budget_gb;
     mparams.tensor_split   = params.tensor_split;
     mparams.use_mmap       = params.use_mmap;
     mparams.use_mlock      = params.use_mlock;
@@ -1402,4 +1414,5 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+    fprintf(stream, "vram_budget: %f # default: -1.0 (all available VRAM)\n", params.vram_budget_gb);
 }
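For context, a hedged usage sketch (not from this commit) of how the parsed flag reaches model loading: llama_model_params_from_gpt_params is the helper patched above, and llama_load_model_from_file is the existing llama.cpp loader; the literal budget value is illustrative only.

#include "common.h"
#include "llama.h"

// Illustrative only: the budget is normally filled in from "--vram-budget 8.5";
// the patched llama_model_params_from_gpt_params then carries it into the
// model parameters consumed by the loader.
static llama_model * load_with_budget(gpt_params params) {
    params.vram_budget_gb = 8.5f; // hypothetical value for this sketch
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    return llama_load_model_from_file(params.model.c_str(), mparams);
}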

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ struct gpt_params {
     int32_t n_beams          = 0;     // if non-zero then use beam search of given width.
     float   rope_freq_base   = 0.0f;  // RoPE base frequency
     float   rope_freq_scale  = 0.0f;  // RoPE frequency scaling factor
+    float   vram_budget_gb   = -1.0f; // VRAM budget in GB (-1 - use available VRAM)
     float   yarn_ext_factor  = -1.0f; // YaRN extrapolation mix factor
     float   yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
     float   yarn_beta_fast   = 32.0f; // YaRN low correction dim

ggml-cuda.cu

Lines changed: 8 additions & 0 deletions
@@ -9338,6 +9338,13 @@ int ggml_cuda_get_device_count() {
     return device_count;
 }
 
+size_t ggml_cuda_get_free_memory(int device) {
+    size_t free, total;
+    CUDA_CHECK(cudaSetDevice(device));
+    CUDA_CHECK(cudaMemGetInfo(&free, &total));
+    return free;
+}
+
 void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
@@ -9610,3 +9617,4 @@ ggml_backend_t ggml_backend_cuda_init() {
 
     return cuda_backend;
 }
+

ggml-cuda.h

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
 
 GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API size_t ggml_cuda_get_free_memory(int device);
 
 // backend API
 GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
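A small, self-contained sketch of how the newly exported query could be used from host code linking against ggml-cuda; only ggml_cuda_get_device_count, ggml_cuda_get_device_description, and ggml_cuda_get_free_memory come from the header above, and the loop and output formatting are illustrative, not part of the commit.

#include <cstdio>
#include "ggml-cuda.h"

// Illustrative only: report free VRAM for every visible CUDA device.
int main() {
    const int n_devices = ggml_cuda_get_device_count();
    for (int dev = 0; dev < n_devices; ++dev) {
        char desc[128];
        ggml_cuda_get_device_description(dev, desc, sizeof(desc));
        const size_t free_bytes = ggml_cuda_get_free_memory(dev);
        printf("device %d (%s): %.2f GiB free\n",
               dev, desc, free_bytes / (1024.0 * 1024.0 * 1024.0));
    }
    return 0;
}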
