Commit 59aa825

add scratch VRAM usage to info print
1 parent 927fac3 commit 59aa825

3 files changed: 22 additions & 17 deletions

ggml-cuda.cu

Lines changed: 7 additions & 11 deletions
@@ -535,11 +535,8 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }
 
 
-#define GGML_CUDA_SCRATCH_SIZE_PER_BATCH 1048576 // 1 MB
-//#define GGML_CUDA_SCRATCH_SIZE_PER_BATCH 2097152 // 2 MB
-
 static void * g_scratch_buffer = nullptr;
-static int g_n_batch = 512;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
 #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
@@ -1292,9 +1289,8 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
     }
 
     const size_t size = ggml_nbytes(tensor);
-    const size_t scratch_size = g_n_batch * GGML_CUDA_SCRATCH_SIZE_PER_BATCH;
-    GGML_ASSERT(size <= scratch_size);
-    if (g_scratch_offset + size > scratch_size) {
+    GGML_ASSERT(size <= g_scratch_size);
+    if (g_scratch_offset + size > g_scratch_size) {
         g_scratch_offset = 0;
     }
 
@@ -1311,7 +1307,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
     } else {
         char * data = (char *) g_scratch_buffer;
         if (data == nullptr) {
-            CUDA_CHECK(cudaMalloc(&data, scratch_size));
+            CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
         extra->data_device[g_main_device] = data + g_scratch_offset;
@@ -1322,12 +1318,12 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
     // fprintf(stderr, "%s: scratch %d, %p - %p\n",
     //     tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size);
 
-    GGML_ASSERT(g_scratch_offset <= scratch_size);
+    GGML_ASSERT(g_scratch_offset <= g_scratch_size);
     tensor->extra = extra;
 }
 
-void ggml_cuda_set_n_batch(int n_batch) {
-    g_n_batch = n_batch;
+void ggml_cuda_set_scratch_size(size_t scratch_size) {
+    g_scratch_size = scratch_size;
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
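
For context on what the simplification above amounts to: the scratch buffer is now sized by a single `g_scratch_size` global set in bytes, and allocation is just an offset that wraps back to zero whenever the next tensor would not fit. The host-only sketch below (plain C++, no CUDA) imitates that logic; the helper name `assign_scratch` and the offset advance at the end are illustrative assumptions, not the actual ggml-cuda code.

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

static size_t            g_scratch_size   = 1024*1024*1024; // 1 GB by default, as in the diff
static size_t            g_scratch_offset = 0;
static std::vector<char> g_scratch_buffer;                  // host stand-in for the cudaMalloc'd buffer

// Mirrors the offset handling in ggml_cuda_assign_buffers: wrap to the start of
// the buffer when the next tensor would not fit, then hand out the current slot.
// The offset advance at the end is an assumption about surrounding code that the
// diff does not show.
char * assign_scratch(size_t size) {
    assert(size <= g_scratch_size);                 // a single tensor must fit in the scratch buffer
    if (g_scratch_offset + size > g_scratch_size) {
        g_scratch_offset = 0;                       // wrap around; earlier scratch data may be reused
    }
    if (g_scratch_buffer.empty()) {
        g_scratch_buffer.resize(g_scratch_size);    // allocated lazily, like the cudaMalloc above
    }
    char * data = g_scratch_buffer.data() + g_scratch_offset;
    g_scratch_offset += size;
    assert(g_scratch_offset <= g_scratch_size);
    return data;
}

int main() {
    const size_t MB = 1024*1024;
    g_scratch_size = 8*MB;                          // small value for the demo; llama.cpp will use n_batch * 1 MB

    char * a = assign_scratch(4*MB);
    char * b = assign_scratch(6*MB);                // 4 MB + 6 MB > 8 MB, so the offset wraps back to 0

    printf("a at offset %zu MB, b at offset %zu MB\n",
           size_t(a - g_scratch_buffer.data()) / MB,
           size_t(b - g_scratch_buffer.data()) / MB);
    return 0;
}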

ggml-cuda.h

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ void ggml_cuda_host_free(void * ptr);
 void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-void ggml_cuda_set_n_batch(int n_batch);
+void ggml_cuda_set_scratch_size(size_t scratch_size);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
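
A compile-and-run sketch of what the header change means for callers: the setter now takes a size in bytes instead of a batch count, so call sites convert `n_batch` to bytes themselves. The stub definition exists only so this snippet builds standalone; the real one is in ggml-cuda.cu, and the `n_batch * MB` sizing shown here is the policy llama.cpp adopts in the next file.

#include <cstddef>
#include <cstdio>

static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default, as in the .cu change

void ggml_cuda_set_scratch_size(size_t scratch_size) { // new signature (was: void ggml_cuda_set_n_batch(int n_batch))
    g_scratch_size = scratch_size;
}

int main() {
    const size_t MB      = 1024*1024;
    const int    n_batch = 512;

    // A caller that previously passed n_batch now supplies the byte count directly:
    ggml_cuda_set_scratch_size(n_batch * MB);   // was: ggml_cuda_set_n_batch(n_batch)
    printf("scratch buffer size: %zu MB\n", g_scratch_size / MB);
    return 0;
}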

llama.cpp

Lines changed: 14 additions & 5 deletions
@@ -1051,7 +1051,8 @@ static void llama_model_load_internal(
 #endif
 
     // prepare memory for the weights
-    size_t vram_total = 0;
+    size_t vram_weights = 0;
+    size_t vram_scratch = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1099,7 +1100,7 @@ static void llama_model_load_internal(
             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
 
             if (backend == GGML_BACKEND_GPU) {
-                vram_total +=
+                vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
@@ -1116,7 +1117,7 @@ static void llama_model_load_internal(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
-            mmapped_size - vram_total + // weights in VRAM not in memory
+            mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at    (model.type);
@@ -1130,12 +1131,21 @@ static void llama_model_load_internal(
 
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
+#ifdef GGML_USE_CUBLAS
+        vram_scratch = n_batch * MB;
+        ggml_cuda_set_scratch_size(vram_scratch);
+        if (n_gpu_layers > 0) {
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch / MB);
+        }
+#endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
             fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
         }
-        fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
 #endif
@@ -1150,7 +1160,6 @@ static void llama_model_load_internal(
 
 #if defined(GGML_USE_CUBLAS)
     {
-        ggml_cuda_set_n_batch(n_batch);
         ggml_cuda_set_tensor_split(tensor_split);
 
         size_t done_size = 0;
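
The arithmetic behind the new info prints is easy to verify: the scratch buffer is sized at `n_batch * MB` (1 MB per batch element), and the total VRAM line now rounds up to whole megabytes via `(x + MB - 1) / MB` instead of truncating. A small standalone sketch, using a made-up `vram_weights` value purely for illustration:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t MB = 1024*1024;

    const int    n_batch      = 512;
    const size_t vram_scratch = n_batch * MB;              // 1 MB of scratch VRAM per batch element
    const size_t vram_weights = 3825ull*MB + 123456;       // made-up weight total, not a real model size

    const size_t old_style = (vram_weights + vram_scratch) / 1024 / 1024;  // truncates: 4337 MB
    const size_t new_style = (vram_weights + vram_scratch + MB - 1) / MB;  // rounds up: 4338 MB

    fprintf(stderr, "allocating batch_size x 1 MB = %zu MB VRAM for the scratch buffer\n", vram_scratch / MB);
    fprintf(stderr, "total VRAM used: %zu MB (would have printed %zu MB without rounding up)\n", new_style, old_style);
    return 0;
}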
