@@ -5099,7 +5099,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
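Editor's note (not part of the patch): the no_init<T> wrapper above relies on the difference between value-initialization and default-initialization. Its empty user-provided constructor leaves value untouched, so std::vector<no_init<uint8_t>>::resize() does not zero-fill new elements the way std::vector<uint8_t>::resize() does, which matters when the same scratch buffer is re-resized for every tensor. A minimal standalone sketch of the idea; the buffer sizes are hypothetical, not taken from the patch:

#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
struct no_init {
    T value;
    no_init() { /* intentionally empty: leave value uninitialized */ }
};

int main() {
    // resize() on vector<uint8_t> value-initializes (zeroes) every new element;
    // resize() on vector<no_init<uint8_t>> calls the empty constructor, a no-op,
    // so growing and reusing the scratch buffer costs only the occasional realloc.
    std::vector<no_init<uint8_t>> scratch;
    for (size_t n : {1u << 20, 1u << 10, 1u << 22}) { // hypothetical tensor byte counts
        if (scratch.size() < n) {
            scratch.resize(n); // no zero-fill happens here
        }
        scratch[0].value = 42; // element access goes through .value
    }
    std::printf("capacity after reuse: %zu\n", scratch.capacity());
    return 0;
}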
@@ -5134,7 +5143,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -5147,14 +5155,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & worker : workers) {
-        worker.join();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+}
+
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
     }
+
+    return new_type;
 }
+#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
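Editor's note (not part of the patch): after this change both llama_convert_tensor_internal and the quantization loop share one caller-owned std::vector<std::thread> and follow the same emplace_back / join / clear cycle, so the vector's capacity is reused across tensors instead of being reallocated per call. A self-contained sketch of that calling convention; parallel_fill and its parameters are illustrative names, not llama.cpp API:

#include <cstdio>
#include <thread>
#include <vector>

// Hypothetical helper mirroring the convention used in the patch: the caller
// passes in a long-lived workers vector instead of the helper creating its own.
static void parallel_fill(std::vector<int> & out, std::vector<std::thread> & workers, int nthread) {
    const size_t per_thread = out.size() / nthread;
    for (int t = 0; t < nthread; ++t) {
        const size_t begin = t * per_thread;
        const size_t end   = (t == nthread - 1) ? out.size() : begin + per_thread;
        workers.emplace_back([&out, begin, end, t]() {
            for (size_t i = begin; i < end; ++i) { out[i] = t; }
        });
    }
    for (auto & w : workers) { w.join(); }
    workers.clear(); // leaves capacity in place for the next call
}

int main() {
    std::vector<std::thread> workers;
    workers.reserve(4);              // same idea as workers.reserve(nthread) in the patch
    std::vector<int> data(1000);
    for (int call = 0; call < 3; ++call) { // the same vector is reused across calls
        parallel_fill(data, workers, 4);
    }
    std::printf("data[999] = %d\n", data[999]);
    return 0;
}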
@@ -5239,18 +5356,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
-    std::vector<uint8_t> read_data;
-    std::vector<uint8_t> work;
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -5272,7 +5385,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.resize(ggml_nbytes(tensor));
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -5297,101 +5412,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(ml->get_arch());
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -5406,23 +5429,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.resize(nelements * 4); // upper bound on size
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::vector<int64_t> hist_cur(1 << 4, 0);
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -5433,13 +5457,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t counter = 0;
                 new_size = 0;
                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::vector<int64_t> local_hist;
+                    std::array<int64_t, 1 << 4> local_hist = {};
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
                         size_t first = counter; counter += chunk_size;
                         if (first >= nelements) {
-                            if (!local_hist.empty()) {
+                            if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
                                 }
@@ -5449,22 +5473,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it] = std::thread(compute);
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it].join();
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }
 
             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);