
Commit a95aa21

llama : optimize vector use in quantize -> 179% faster
1 parent 0c64968 commit a95aa21
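
The commit keeps the scratch buffers (read_data, work, f32_conv_buf) and the worker-thread vector alive across the per-tensor loop, grows them only when a larger tensor needs more room, constructs threads in place with emplace_back, and replaces heap-allocated histogram vectors with fixed-size std::array. The sketch below is not code from the commit; it is a minimal, self-contained illustration of that reuse pattern, and every name in it (process_chunk, the tensor sizes, the thread count) is made up for the example.

// Minimal sketch of the reuse pattern (hypothetical example, not the commit's code).
#include <array>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for the real per-chunk work (dequantizing/quantizing a slice of a tensor).
static void process_chunk(const float * src, float * dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src[i] * 0.5f;
    }
}

int main() {
    const int nthread = 4;

    // Allocated once and reused for every "tensor": analogous to read_data/work/f32_conv_buf.
    std::vector<float>       scratch;
    // The thread handles are also reused instead of building a fresh vector per tensor.
    std::vector<std::thread> workers;
    workers.reserve(nthread);

    const size_t tensor_sizes[] = { 1024, 4096, 2048 }; // hypothetical tensor sizes

    for (size_t nelements : tensor_sizes) {
        std::vector<float> input(nelements, 1.0f);

        // Grow only when a larger tensor needs it; never shrink, so there is no repeated reallocation.
        if (scratch.size() < nelements) {
            scratch.resize(nelements);
        }

        const size_t per_thread = nelements / nthread;
        for (int t = 0; t < nthread; ++t) {
            const size_t first = (size_t) t * per_thread;
            const size_t count = (t == nthread - 1) ? nelements - first : per_thread;
            // emplace_back constructs the std::thread in place (no temporary + move).
            workers.emplace_back(process_chunk, input.data() + first, scratch.data() + first, count);
        }
        for (auto & w : workers) { w.join(); }
        workers.clear(); // keep the capacity, drop the joined handles

        // Fixed-size histogram on the stack instead of a heap-allocated std::vector.
        std::array<int64_t, 16> hist = {};
        hist[0] += (int64_t) nelements;
        printf("processed %zu elements, hist[0] = %lld\n", nelements, (long long) hist[0]);
    }
    return 0;
}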


llama.cpp

Lines changed: 22 additions & 23 deletions
@@ -4639,7 +4639,10 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<float> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
@@ -4674,7 +4677,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4687,13 +4689,12 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & worker : workers) {
-        worker.join();
-    }
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
 }
 
 #ifdef GGML_USE_K_QUANTS
@@ -4889,12 +4890,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
     int idx = 0;
 
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
+    std::vector<float> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4916,7 +4919,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.resize(ggml_nbytes(tensor));
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -4958,23 +4963,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.resize(nelements * 4); // upper bound on size
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::vector<int64_t> hist_cur(1 << 4, 0);
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4985,13 +4991,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t counter = 0;
                 new_size = 0;
                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::vector<int64_t> local_hist;
+                    std::array<int64_t, 1 << 4> local_hist = {};
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
                         size_t first = counter; counter += chunk_size;
                         if (first >= nelements) {
-                            if (!local_hist.empty()) {
+                            if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
                                 }
@@ -5001,22 +5007,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it] = std::thread(compute);
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it].join();
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }
 
             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
