@@ -22,7 +22,7 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-struct quantize_state_internal {
+struct quantize_state_impl {
     const llama_model & model;
     const llama_model_quantize_params * params;
 
@@ -43,13 +43,13 @@ struct quantize_state_internal {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
 
-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
         {}
 };
 
-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
@@ -121,7 +121,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -410,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
         size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
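For `nthread >= 2` the renamed `llama_tensor_quantize_impl` hands out row chunks to the worker threads instead of taking the single-thread path shown above; that part of the body is outside this hunk. Below is a minimal sketch of such a chunked dispatch, assuming only the public `ggml_quantize_chunk` API — the helper name `quantize_rows_chunked` and its internals are illustrative, not the actual function body from this file.

```cpp
// Hedged sketch: chunked, multi-threaded quantization around ggml_quantize_chunk.
// Only ggml_quantize_chunk is the real API; everything else is illustrative.
#include <algorithm>
#include <atomic>
#include <thread>
#include <vector>

#include "ggml.h"

static size_t quantize_rows_chunked(enum ggml_type new_type, const float * f32_data, void * new_data,
                                    int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix,
                                    std::vector<std::thread> & workers, int nthread) {
    const int64_t nrows_per_chunk = chunk_size / n_per_row; // rows handled per work item
    std::atomic<int64_t> counter{0};   // next row to hand out
    std::atomic<size_t>  new_size{0};  // total bytes written

    auto compute = [&]() {
        for (;;) {
            const int64_t first_row = counter.fetch_add(nrows_per_chunk);
            if (first_row >= nrows) {
                break;
            }
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            // start offset is given in elements; ggml_quantize_chunk offsets src/dst internally
            new_size += ggml_quantize_chunk(new_type, f32_data, new_data,
                                            first_row * n_per_row, this_nrow, n_per_row, imatrix);
        }
    };

    for (int i = 0; i < nthread - 1; ++i) {
        workers.emplace_back(compute);
    }
    compute(); // the calling thread participates as well
    for (auto & w : workers) { w.join(); }
    workers.clear();

    return new_size;
}
```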
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
@@ -534,7 +534,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_hparams(ml, model);
     llm_load_stats(ml, model);
 
-    struct quantize_state_internal qs(model, params);
+    struct quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = model.ftype;
@@ -837,7 +837,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-            llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+            llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }
 
@@ -866,7 +866,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
-                new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
             }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
@@ -919,7 +919,7 @@ uint32_t llama_model_quantize(
         const char * fname_out,
         const llama_model_quantize_params * params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
+        llama_model_quantize_impl(fname_inp, fname_out, params);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
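The last hunk touches the public wrapper `llama_model_quantize`, which catches exceptions thrown by the renamed `llama_model_quantize_impl` and maps them to a non-zero return value. A minimal caller sketch, assuming the exported `llama.h` API; the file names and chosen `ftype` below are placeholders, not values from this change:

```cpp
// Hedged sketch of a caller; input/output paths and the target ftype are placeholders.
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
    params.nthread = 4;                         // <= 0 lets the library pick a thread count

    // returns 0 on success; non-zero means the impl threw and the error was logged
    return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
}
```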