@@ -1051,7 +1051,8 @@ static void llama_model_load_internal(
 #endif
 
     // prepare memory for the weights
-    size_t vram_total = 0;
+    size_t vram_weights = 0;
+    size_t vram_scratch = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1099,7 +1100,7 @@ static void llama_model_load_internal(
             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
 
             if (backend == GGML_BACKEND_GPU) {
-                vram_total +=
+                vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1116,7 +1117,7 @@ static void llama_model_load_internal(
     // this is the total memory required to run the inference
     const size_t mem_required =
         ctx_size +
-        mmapped_size - vram_total +   // weights in VRAM not in memory
+        mmapped_size - vram_weights + // weights in VRAM not in memory
         MEM_REQ_SCRATCH0().at(model.type) +
         MEM_REQ_SCRATCH1().at(model.type) +
         MEM_REQ_EVAL().at(model.type);
@@ -1130,12 +1131,21 @@ static void llama_model_load_internal(
 
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
+#ifdef GGML_USE_CUBLAS
+    vram_scratch = n_batch * MB;
+    ggml_cuda_set_scratch_size(vram_scratch);
+    if (n_gpu_layers > 0) {
+        fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                __func__, vram_scratch / MB);
+    }
+#endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
         fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
     }
-    fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+            __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
 #endif
@@ -1150,7 +1160,6 @@ static void llama_model_load_internal(
 
 #if defined(GGML_USE_CUBLAS)
     {
-        ggml_cuda_set_n_batch(n_batch);
         ggml_cuda_set_tensor_split(tensor_split);
 
         size_t done_size = 0;
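
For reference, a minimal standalone sketch of the accounting this diff introduces: VRAM is tracked as offloaded weight bytes plus an n_batch x 1 MB scratch buffer, and the total is reported in whole MB rounded up, using the same expression as the new fprintf. The numbers below are made up for illustration, and no ggml/CUDA calls are made; this only mirrors the arithmetic, not the actual allocation.

#include <cstdio>
#include <cstddef>

// same convention as the MB constant used in llama.cpp
static const size_t MB = 1024 * 1024;

int main() {
    // hypothetical values, for illustration only
    size_t vram_weights = 3500ull * MB + 123456; // bytes of weight tensors offloaded to the GPU
    int    n_batch      = 512;                   // batch size chosen at load time

    // the patch sizes the scratch buffer as n_batch * 1 MB
    size_t vram_scratch = (size_t) n_batch * MB;

    // total VRAM, rounded up to the next MB, matching the new log line
    size_t total_mb = (vram_weights + vram_scratch + MB - 1) / MB;

    printf("total VRAM used: %zu MB\n", total_mb);
    return 0;
}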