@@ -78,10 +78,6 @@ enum e_model {
 static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
-// computed for n_ctx == 2048
-// TODO: dynamically determine these sizes
-//       needs modifications in ggml
-
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
@@ -103,39 +99,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
     ggml_graph_compute(graph, &plan);
 }
 
-
-// TODO: remove, temporary for comparison with graph allocator
-
-// amount of VRAM needed per batch size to hold temporary results
-// the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * kB },
-        { MODEL_7B,   512ull * kB },
-        { MODEL_13B,  640ull * kB },
-        { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1280ull * kB },
-        { MODEL_70B, 1280ull * kB },
-    };
-    return k_sizes;
-}
-
-// amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,  128ull },
-        { MODEL_7B,  128ull },
-        { MODEL_13B, 160ull },
-        { MODEL_30B, 208ull },
-        { MODEL_65B, 256ull },
-        { MODEL_70B, 256ull },
-    };
-    return k_sizes;
-}
-
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -1101,7 +1064,6 @@ static void llama_model_load_internal(
 
     // prepare memory for the weights
     size_t vram_weights = 0;
-    size_t vram_scratch = 0;
     {
         const uint32_t n_embd     = hparams.n_embd;
         const uint32_t n_embd_gqa = hparams.n_embd_gqa();
@@ -1192,24 +1154,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-    (void) vram_scratch;
     (void) n_batch;
-#ifdef GGML_USE_CUBLAS
-    if (low_vram) {
-        LLAMA_LOG_INFO("%s: (debug) not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
-        ggml_cuda_set_scratch_size(0); // disable scratch
-    } else {
-        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
-        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
-        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
-        // ggml_cuda_set_scratch_size(vram_scratch);
-        if (n_gpu_layers > 0) {
-            LLAMA_LOG_INFO("%s: (debug) not allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
-                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
-                    (vram_scratch + MB - 1) / MB); // round up
-        }
-    }
-#endif // GGML_USE_CUBLAS
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
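For reference, here is a minimal standalone sketch of what the removed heuristic computed, using the MODEL_7B entries from the deleted VRAM_REQ_SCRATCH_* tables; n_batch = 512 and n_ctx = 2048 are assumed example values, not values taken from this patch:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t kB = 1024;
    const size_t MB = 1024*1024;

    // MODEL_7B entries from the removed tables
    const size_t vram_scratch_base        = 512ull * kB; // per batch
    const size_t vram_scratch_per_context = 128ull;      // per batch, per context token

    const size_t n_batch = 512;  // assumed example value
    const size_t n_ctx   = 2048; // assumed example value

    // same formula as the removed block in llama_model_load_internal
    const size_t vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);

    // 512 * (512 KiB + 2048 * 128 B) = 512 * 768 KiB = 384 MB
    printf("estimated scratch: %zu MB\n", (vram_scratch + MB - 1) / MB); // round up, as in the old log line
    return 0;
}

Since the graph allocator now measures the actual scratch requirement per graph (see the alloc_size-based logging further down), the hard-coded per-model constants and this debug-only estimate appear to be what the removal targets.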
@@ -1246,8 +1191,8 @@ static void llama_model_load_internal(
 
     LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
             __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-    LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
-            __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+    LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
+            __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3314,7 +3259,7 @@ struct llama_context * llama_new_context_with_model(
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: allocating %.2f MB VRAM for the scratch buffer\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
         }
 #endif
     }
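As a side note, the two surviving log lines report megabytes differently: the weights/KV line rounds integer bytes up to whole MB, while the scratch line prints fractional MB. A small sketch of both conversions, with a byte count chosen only for illustration:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t MB = 1024*1024;
    const size_t bytes = 402653184 + 1; // arbitrary example: one byte over 384 MB

    // integer round-up, as in "%s: VRAM used: %zu MB"
    printf("VRAM used: %zu MB\n", (bytes + MB - 1) / MB);              // -> 385 MB

    // fractional, as in "%s: VRAM scratch buffer: %.2f MB"
    printf("VRAM scratch buffer: %.2f MB\n", bytes / 1024.0 / 1024.0); // -> 384.00 MB
    return 0;
}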