
Commit f4ec97e

cebtenzzre authored and pkrmf committed
llama : make quantize example up to 2.7x faster (ggml-org#3115)
1 parent f845559 commit f4ec97e
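Most of the speedup comes from reusing big scratch buffers and worker threads across tensors instead of recreating them for every tensor: the diff introduces a `no_init<T>` wrapper so that resizing the reused `read_data`, `work`, and `f32_conv_buf` vectors no longer zero-initializes freshly grown elements, keeps one `workers` vector alive for the whole run, and swaps the heap-allocated histograms for fixed-size `std::array`s. Below is a minimal, self-contained sketch of the `no_init` idea; only the `no_init` struct itself mirrors the commit, while `scratch_for`, the buffer in `main`, and the sizes are illustrative.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Trivial wrapper with a user-provided, empty default constructor: value-initializing
// a no_init<T> runs this constructor and leaves `value` uninitialized, so
// std::vector<no_init<T>>::resize() grows the buffer without the memset that
// std::vector<T>::resize() would perform for trivial T. Mirrors the struct in the diff.
template <typename T>
struct no_init {
    T value;
    no_init() { /* do nothing */ }
};

// Hypothetical helper showing the reuse pattern from the diff: grow the shared scratch
// buffer only when the current tensor needs more room; never shrink or re-zero it.
static uint8_t * scratch_for(std::vector<no_init<uint8_t>> & buf, size_t nbytes) {
    if (buf.size() < nbytes) {
        buf.resize(nbytes); // allocation at most, no per-byte zeroing
    }
    return reinterpret_cast<uint8_t *>(buf.data());
}

int main() {
    std::vector<no_init<uint8_t>> read_data;        // reused across all tensors
    uint8_t * a = scratch_for(read_data, 1 << 20);  // first tensor: one allocation
    uint8_t * b = scratch_for(read_data, 1 << 16);  // smaller tensor: no realloc, no memset
    (void) a; (void) b;
    return 0;
}
```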

File tree

1 file changed, +142 -125 lines changed


llama.cpp

Lines changed: 142 additions & 125 deletions
@@ -5099,7 +5099,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
@@ -5134,7 +5143,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -5147,14 +5155,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & worker : workers) {
-        worker.join();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+}
+
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
     }
+
+    return new_type;
 }
+#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
@@ -5239,18 +5356,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
-    std::vector<uint8_t> read_data;
-    std::vector<uint8_t> work;
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -5272,7 +5385,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.resize(ggml_nbytes(tensor));
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -5297,101 +5412,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(ml->get_arch());
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -5406,23 +5429,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.resize(nelements * 4); // upper bound on size
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::vector<int64_t> hist_cur(1 << 4, 0);
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -5433,13 +5457,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t counter = 0;
                 new_size = 0;
                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::vector<int64_t> local_hist;
+                    std::array<int64_t, 1 << 4> local_hist = {};
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
                         size_t first = counter; counter += chunk_size;
                         if (first >= nelements) {
-                            if (!local_hist.empty()) {
+                            if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
                                 }
@@ -5449,22 +5473,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it] = std::thread(compute);
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it].join();
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }
 
             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);

0 commit comments
