@@ -4639,7 +4639,10 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<float> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
@@ -4674,7 +4677,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4687,13 +4689,12 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & worker : workers) {
-        worker.join();
-    }
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
 }
 
 #ifdef GGML_USE_K_QUANTS
@@ -4889,12 +4890,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
     int idx = 0;
 
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
+    std::vector<float> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4916,7 +4919,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.resize(ggml_nbytes(tensor));
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -4958,23 +4963,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.resize(nelements * 4); // upper bound on size
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::vector<int64_t> hist_cur(1 << 4, 0);
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4985,13 +4991,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t counter = 0;
                 new_size = 0;
                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::vector<int64_t> local_hist;
+                    std::array<int64_t, 1 << 4> local_hist = {};
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
                         size_t first = counter; counter += chunk_size;
                         if (first >= nelements) {
-                            if (!local_hist.empty()) {
+                            if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
                                 }
@@ -5001,22 +5007,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it] = std::thread(compute);
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it].join();
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }
 
             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
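For reference, a minimal standalone sketch of the buffer-reuse pattern these hunks apply: scratch buffers are resized only when they must grow, and the shared worker vector is joined and cleared after each pass so its capacity carries over to the next iteration. The names and sizes below are illustrative only and are not taken from the patch.

// Illustrative sketch (not part of the patch): reuse a scratch buffer and a
// worker vector across iterations instead of reallocating them every pass.
#include <cstddef>
#include <thread>
#include <vector>

int main() {
    const int nthread = 4;
    std::vector<std::thread> workers;
    workers.reserve(nthread);
    std::vector<float> scratch;              // grows monotonically, never reallocated per pass

    const size_t sizes[] = {1024, 256, 4096}; // per-"tensor" element counts (made up)
    for (size_t need : sizes) {
        if (scratch.size() < need) {
            scratch.resize(need);            // allocate only when the buffer must grow
        }
        for (int t = 0; t < nthread; ++t) {
            workers.emplace_back([&scratch, need, t, nthread]() {
                // each worker fills a disjoint strided slice of the reused buffer
                for (size_t i = (size_t) t; i < need; i += (size_t) nthread) {
                    scratch[i] = (float) i;
                }
            });
        }
        for (auto & w : workers) { w.join(); }
        workers.clear();                     // drop joined threads, keep the vector's capacity
    }
    return 0;
}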