@@ -1083,9 +1083,9 @@ enum e_model {
     MODEL_70B,
 };

-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kB = 1000;
+static const size_t MB = 1000*kB;
+static const size_t GB = 1000*MB;

 struct llama_hparams {
     bool vocab_only;
@@ -1481,7 +1481,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1e6);
         }
     }
 #endif
@@ -2520,9 +2520,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
     if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.2f MB (%.2f BPW) \n", __func__, ml.n_bytes/1e6, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.2f GB (%.2f BPW) \n", __func__, ml.n_bytes/1e9, ml.n_bytes*8.0/ml.n_elements);
     }

     // general kv
@@ -2558,7 +2558,7 @@ static void llm_load_tensors(

     ml.calc_sizes(ctx_size, mmapped_size);

-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1e6);

     // create the ggml context
     {
@@ -3207,7 +3207,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory

-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1e6);

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3226,7 +3226,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS

         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1e6);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7878,7 +7878,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1e6);
         } else {
             const size_t nelements = ggml_nelements(tensor);

@@ -7938,7 +7938,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }

-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1e6, new_size/1e6);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -7976,8 +7976,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     gguf_free(ctx_out);

-    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1e6);
+    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1e6);

     // print histogram for all tensors
     {
@@ -8478,7 +8478,7 @@ struct llama_context * llama_new_context_with_model(

         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1e6);
         }

         // resized during inference
@@ -8523,7 +8523,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1e6);

             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8537,7 +8537,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1e6);

             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8558,9 +8558,9 @@ struct llama_context * llama_new_context_with_model(
             size_t total_vram_size = model_vram_size + ctx_vram_size;

             LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
-                    total_vram_size / 1024.0 / 1024.0,
-                    model_vram_size / 1024.0 / 1024.0,
-                    ctx_vram_size / 1024.0 / 1024.0);
+                    total_vram_size / 1e6,
+                    model_vram_size / 1e6,
+                    ctx_vram_size / 1e6);
 #endif
         }

@@ -8581,7 +8581,7 @@ struct llama_context * llama_new_context_with_model(

             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1e6);

 #define LLAMA_METAL_CHECK_BUF(result) \
             if (!(result)) { \
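
Note (not part of the diff above): the change switches the size constants and log divisors from binary units (1024-based, properly MiB/GiB) to decimal SI units (1e6/1e9), so the printed "MB"/"GB" labels match what is actually computed. A minimal standalone C++ sketch of the two conventions, using a made-up byte count purely for illustration:

#include <cstdio>

int main() {
    const unsigned long long n_bytes = 7365960704ull; // hypothetical model size in bytes

    // old behaviour: divide by 1024^2 / 1024^3 (binary units, i.e. MiB/GiB)
    std::printf("%.2f MiB\n", n_bytes / 1024.0 / 1024.0);          // 7024.73 MiB
    std::printf("%.2f GiB\n", n_bytes / 1024.0 / 1024.0 / 1024.0); // 6.86 GiB

    // new behaviour: divide by 1e6 / 1e9 (decimal units, matching the MB/GB labels)
    std::printf("%.2f MB\n", n_bytes / 1e6); // 7365.96 MB
    std::printf("%.2f GB\n", n_bytes / 1e9); // 7.37 GB

    return 0;
}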