
Commit 0fc72f3

cleanup
1 parent 2bd1b02 commit 0fc72f3

File tree

2 files changed: +6 -61 lines

ggml-cuda.cu

Lines changed: 3 additions & 3 deletions
@@ -6329,11 +6329,11 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-        size_t offset = 0;
+        size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->op_params, sizeof(size_t));
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
         }
-        extra->data_device[g_main_device] = src0_ddc + offset;
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
     } else {
         extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
     }
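
The point of the rename: `offset` is also the name of the `ggml_cuda_assign_scratch_offset` parameter, which the `else` branch above still needs, so the old local `offset` shadowed it inside the view branch. A minimal sketch of that shadowing hazard, using hypothetical names (`pick_offset`, `g_buffer`) rather than anything from this repo:

    #include <cstdio>

    static const char g_buffer[] = "0123456789";

    // Hypothetical function with the same shape as ggml_cuda_assign_scratch_offset:
    // the parameter `offset` is still needed in the else branch, so the local in the
    // if branch must not reuse that name (a local `offset` would shadow it; -Wshadow warns).
    static const char * pick_offset(bool inplace, size_t offset) {
        if (inplace) {
            size_t view_offset = 2;          // in the real code this is read from tensor->op_params
            return g_buffer + view_offset;   // the view's own offset
        } else {
            return g_buffer + offset;        // the caller-supplied offset, unshadowed
        }
    }

    int main() {
        printf("%s\n", pick_offset(true,  5)); // prints "23456789"
        printf("%s\n", pick_offset(false, 5)); // prints "56789"
    }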

llama.cpp

Lines changed: 3 additions & 58 deletions
@@ -78,10 +78,6 @@ enum e_model {
 static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
-// computed for n_ctx == 2048
-// TODO: dynamically determine these sizes
-//       needs modifications in ggml
-
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
@@ -103,39 +99,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
     ggml_graph_compute(graph, &plan);
 }
 
-
-// TODO: remove, temporary for comparison with graph allocator
-
-// amount of VRAM needed per batch size to hold temporary results
-// the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * kB },
-        { MODEL_7B,   512ull * kB },
-        { MODEL_13B,  640ull * kB },
-        { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1280ull * kB },
-        { MODEL_70B, 1280ull * kB },
-    };
-    return k_sizes;
-}
-
-// amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,  128ull },
-        { MODEL_7B,  128ull },
-        { MODEL_13B, 160ull },
-        { MODEL_30B, 208ull },
-        { MODEL_65B, 256ull },
-        { MODEL_70B, 256ull },
-    };
-    return k_sizes;
-}
-
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
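
The two helpers removed here follow the function-local static idiom: the table is constructed once, on first call, and handed back by const reference, which avoids the static initialization order problem for a non-trivial global like `std::map`. A stripped-down sketch of the same pattern, reusing two of the entries above but keyed by a plain string for brevity:

    #include <cstdio>
    #include <map>
    #include <string>

    // Hypothetical lookup with the same shape as the removed VRAM_REQ_SCRATCH_* helpers:
    // the map is built lazily on the first call and reused by reference afterwards.
    static const std::map<std::string, size_t> & SCRATCH_TABLE() {
        static std::map<std::string, size_t> k_sizes = {
            { "7B",  512ull * 1024 },
            { "13B", 640ull * 1024 },
        };
        return k_sizes;
    }

    int main() {
        printf("7B base scratch: %zu bytes\n", SCRATCH_TABLE().at("7B"));
    }

With the graph allocator measuring the real requirement, the hardcoded tables are no longer needed.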
@@ -1101,7 +1064,6 @@ static void llama_model_load_internal(
 
     // prepare memory for the weights
     size_t vram_weights = 0;
-    size_t vram_scratch = 0;
     {
         const uint32_t n_embd     = hparams.n_embd;
         const uint32_t n_embd_gqa = hparams.n_embd_gqa();
@@ -1192,24 +1154,7 @@
         LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-        (void) vram_scratch;
         (void) n_batch;
-#ifdef GGML_USE_CUBLAS
-        if (low_vram) {
-            LLAMA_LOG_INFO("%s: (debug) not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
-            ggml_cuda_set_scratch_size(0); // disable scratch
-        } else {
-            const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
-            const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
-            vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
-            //ggml_cuda_set_scratch_size(vram_scratch);
-            if (n_gpu_layers > 0) {
-                LLAMA_LOG_INFO("%s: (debug) not allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
-                        __func__, vram_scratch_base / kB, vram_scratch_per_context,
-                        (vram_scratch + MB - 1) / MB); // round up
-            }
-        }
-#endif // GGML_USE_CUBLAS
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
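
For a sense of scale, the estimate deleted here was `n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context)`. A small sketch evaluating it with the removed 7B table entries; n_ctx = 2048 and n_batch = 512 are assumed for illustration (the removed comment in the first llama.cpp hunk mentions n_ctx == 2048):

    #include <cstdio>

    int main() {
        const size_t kB = 1024;
        const size_t MB = 1024 * 1024;

        // Entries from the removed 7B rows of the scratch tables.
        const size_t vram_scratch_base        = 512 * kB; // bytes per batch element
        const size_t vram_scratch_per_context = 128;      // bytes per batch element per context token

        // Assumed illustration parameters, not taken from this commit.
        const size_t n_ctx   = 2048;
        const size_t n_batch = 512;

        const size_t vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
        printf("old 7B scratch estimate: %zu bytes (~%zu MB)\n",
               vram_scratch, (vram_scratch + MB - 1) / MB); // 402653184 bytes, 384 MB
    }

The graph allocator replaces this fixed formula with a measured value (see the last hunk below).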
@@ -1246,8 +1191,8 @@
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
                 __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+        LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
+                __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
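
The `(vram_weights + vram_kv_cache + MB - 1) / MB` expression kept in the new message is the usual integer ceiling division, so a partially used megabyte still counts as a whole one. A tiny check of the idiom:

    #include <cstdio>

    int main() {
        const size_t MB   = 1024 * 1024;
        const size_t used = 5 * MB + 1;            // one byte into the sixth MB
        printf("%zu MB\n", (used + MB - 1) / MB);  // prints "6 MB", not "5 MB"
    }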
@@ -3314,7 +3259,7 @@ struct llama_context * llama_new_context_with_model(
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: allocating %.2f MB VRAM for the scratch buffer\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
         }
 #endif
     }
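
In the new scheme `alloc_size` is measured rather than looked up: earlier in `llama_new_context_with_model`, a measuring allocator walks the compute graph and reports how much scratch memory a worst-case batch actually needs, and that value is what `ggml_cuda_set_scratch_size(alloc_size)` receives in the hunk above. A hedged sketch of that flow, assuming the ggml-alloc API introduced alongside this change (`ggml_allocr_new_measure`, `ggml_allocr_alloc_graph`, `ggml_allocr_free`); the real call sites may differ:

    // Sketch only: mirrors the measure-then-allocate flow, not the exact source.
    #include "ggml.h"
    #include "ggml-alloc.h"

    static size_t measure_scratch_size(struct ggml_cgraph * gf, size_t tensor_alignment) {
        // The measure allocator hands out placeholder addresses and records the
        // peak amount of memory the graph would need.
        struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
        const size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;
        ggml_allocr_free(alloc);
        return alloc_size;
    }

This replaces the per-model scratch tables deleted from llama.cpp earlier in this commit.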
