Commit 15b1937

Offloading tensors based on total VRAM budget and offloading policy (ggml-org#6)
* deprecate ffn_b
* get tensor offloading levels
* wip: split tensor loading
* wip: framework of loading sparse model tensors
* save and flush gpu alloc buffer
* vram budget will fall back to remaining free memory
* minor: remove vram safety margin
* add options for vram budget; clean old env vars
* minor: bugfix
1 parent b89a0b7 commit 15b1937
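The fallback behaviour named in the commit message ("vram budget will fall back to remaining free memory") can be pictured roughly as follows. This is a minimal sketch, not code from the commit: resolve_vram_budget is a hypothetical helper built only on the ggml_cuda_get_free_memory function and the -1 default of --vram-budget that this commit introduces.

#include <cstddef>
#include "ggml-cuda.h"

// Hypothetical helper (not part of the commit): convert the user-facing GiB
// budget into a byte budget for the main GPU. A negative budget means
// "use whatever VRAM is currently free", matching the -1 default.
static size_t resolve_vram_budget(float vram_budget_gb, int main_gpu) {
    if (vram_budget_gb < 0.0f) {
        return ggml_cuda_get_free_memory(main_gpu);
    }
    return (size_t)(vram_budget_gb * 1024.0 * 1024.0 * 1024.0);
}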

File tree: 6 files changed (+417 −119 lines)

common/common.cpp

Lines changed: 13 additions & 0 deletions
@@ -565,6 +565,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 #else
         fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
+    } else if (arg == "--vram-budget") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+#ifdef GGML_USE_CUBLAS
+        params.vram_budget_gb = std::stof(argv[i]);
+#else
+        fprintf(stderr, "warning: PowerInfer was compiled without cuBLAS. It is not possible to set a VRAM budget.\n");
+#endif
     } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
 #ifdef GGML_USE_CUBLAS
         params.mul_mat_q = false;
@@ -801,6 +811,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
     printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
+    printf("  --vram-budget N       VRAM budget in GiB (default: -1, -1 = available VRAM)\n");
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -895,6 +906,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
     mparams.main_gpu       = params.main_gpu;
+    mparams.vram_budget_gb = params.vram_budget_gb;
     mparams.tensor_split   = params.tensor_split;
     mparams.use_mmap       = params.use_mmap;
     mparams.use_mlock      = params.use_mlock;
@@ -1402,4 +1414,5 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+    fprintf(stream, "vram_budget: %f # default: -1.0 (all available VRAM)\n", params.vram_budget_gb);
 }
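For context, a hedged usage sketch (not from this commit) of how the parsed flag reaches model loading: llama_model_params_from_gpt_params is the helper patched above, and llama_load_model_from_file is the existing llama.cpp loader; the literal budget value is illustrative only.

#include "common.h"
#include "llama.h"

// Illustrative only: the budget is normally filled in from "--vram-budget 8.5";
// the patched llama_model_params_from_gpt_params then carries it into the
// model parameters consumed by the loader.
static llama_model * load_with_budget(gpt_params params) {
    params.vram_budget_gb = 8.5f; // hypothetical value for this sketch
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    return llama_load_model_from_file(params.model.c_str(), mparams);
}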

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ struct gpt_params {
     int32_t n_beams          = 0;     // if non-zero then use beam search of given width.
     float   rope_freq_base   = 0.0f;  // RoPE base frequency
     float   rope_freq_scale  = 0.0f;  // RoPE frequency scaling factor
+    float   vram_budget_gb   = -1.0f; // VRAM budget in GB (-1 - use available VRAM)
     float   yarn_ext_factor  = -1.0f; // YaRN extrapolation mix factor
     float   yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
     float   yarn_beta_fast   = 32.0f; // YaRN low correction dim

ggml-cuda.cu

Lines changed: 8 additions & 0 deletions
@@ -9338,6 +9338,13 @@ int ggml_cuda_get_device_count() {
     return device_count;
 }
 
+size_t ggml_cuda_get_free_memory(int device) {
+    size_t free, total;
+    CUDA_CHECK(cudaSetDevice(device));
+    CUDA_CHECK(cudaMemGetInfo(&free, &total));
+    return free;
+}
+
 void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
@@ -9610,3 +9617,4 @@ ggml_backend_t ggml_backend_cuda_init() {
 
     return cuda_backend;
 }
+

ggml-cuda.h

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
 
 GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API size_t ggml_cuda_get_free_memory(int device);
 
 // backend API
 GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
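A small, self-contained sketch of how the newly exported query could be used from host code linking against ggml-cuda; only ggml_cuda_get_device_count, ggml_cuda_get_device_description, and ggml_cuda_get_free_memory come from the header above, and the loop and output formatting are illustrative, not part of the commit.

#include <cstdio>
#include "ggml-cuda.h"

// Illustrative only: report free VRAM for every visible CUDA device.
int main() {
    const int n_devices = ggml_cuda_get_device_count();
    for (int dev = 0; dev < n_devices; ++dev) {
        char desc[128];
        ggml_cuda_get_device_description(dev, desc, sizeof(desc));
        const size_t free_bytes = ggml_cuda_get_free_memory(dev);
        printf("device %d (%s): %.2f GiB free\n",
               dev, desc, free_bytes / (1024.0 * 1024.0 * 1024.0));
    }
    return 0;
}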
