Commit fe61460

Move llama_model_quantize() into llama.cpp

1 parent b72f8d6

4 files changed (+264, −272 lines)

Makefile

Lines changed: 2 additions & 2 deletions

@@ -198,8 +198,8 @@ main: main.cpp ggml.o utils.o llama.o
 	$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
 	./main -h
 
-quantize: quantize.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)
+quantize: quantize.cpp ggml.o utils.o llama.o
+	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
 
 #
 # Tests
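
With llama_model_quantize() moved out of quantize.cpp and into llama.cpp, the quantize target now has to link against llama.o as well, which is all this Makefile change does.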

llama.cpp

Lines changed: 260 additions & 0 deletions

@@ -9,6 +9,11 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <regex>
+
+// TODO: move somewhere else
+#define QK 32
+
 
 // determine number of model parts based on the dimension
 static const std::map<int, int> LLAMA_N_PARTS = {
@@ -688,3 +693,258 @@ bool llama_eval(
 
     return true;
 }
+
+bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {
+    ggml_type type = GGML_TYPE_Q4_1;
+
+    switch (itype) {
+        case 2: type = GGML_TYPE_Q4_0; break;
+        case 3: type = GGML_TYPE_Q4_1; break;
+        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+    };
+
+    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
+        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
+        return false;
+    }
+
+    gpt_vocab vocab;
+
+    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
+
+    auto finp = std::ifstream(fname_inp, std::ios::binary);
+    if (!finp) {
+        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    auto fout = std::ofstream(fname_out, std::ios::binary);
+    if (!fout) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        finp.read((char *) &magic, sizeof(magic));
+        if (magic != 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        fout.write((char *) &magic, sizeof(magic));
+    }
+
+    llama_hparams hparams;
+
+    // load hparams
+    {
+        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
+        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        finp.read((char *) &hparams.n_mult,  sizeof(hparams.n_mult));
+        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
+        finp.read((char *) &hparams.f16,     sizeof(hparams.f16));
+
+        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_mult  = %d\n", __func__, hparams.n_mult);
+        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        printf("%s: f16     = %d\n", __func__, hparams.f16);
+
+        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
+        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        fout.write((char *) &hparams.n_mult,  sizeof(hparams.n_mult));
+        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
+        fout.write((char *) &itype,           sizeof(hparams.f16));
+    }
+
+    // load vocab
+    {
+        const int32_t n_vocab = hparams.n_vocab;
+
+        if (n_vocab != hparams.n_vocab) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
+            return false;
+        }
+
+        std::string word;
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len;
+            finp.read ((char *) &len, sizeof(len));
+            fout.write((char *) &len, sizeof(len));
+
+            word.resize(len);
+            finp.read ((char *) word.data(), len);
+            fout.write((char *) word.data(), len);
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
+    }
+
+    // load weights
+    {
+        size_t total_size_org = 0;
+        size_t total_size_new = 0;
+
+        std::vector<float> work;
+
+        std::vector<uint8_t>     data_u8;
+        std::vector<ggml_fp16_t> data_f16;
+        std::vector<float>       data_f32;
+
+        std::vector<int64_t> hist_all(1 << 4, 0);
+
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ftype;
+
+            finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            finp.read(reinterpret_cast<char *>(&length), sizeof(length));
+            finp.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+
+            if (finp.eof()) {
+                break;
+            }
+
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }
+
+            std::string name(length, 0);
+            finp.read (&name[0], length);
+
+            {
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
+            }
+
+            // regexes of tensor names to be quantized
+            const std::vector<std::string> k_names = {
+                ".*weight",
+            };
+
+            bool quantize = false;
+            for (const auto & s : k_names) {
+                if (std::regex_match(name, std::regex(s))) {
+                    quantize = true;
+                    break;
+                }
+            }
+
+            // quantize only 2D tensors
+            quantize &= (n_dims == 2);
+
+            if (quantize) {
+                if (ftype != 0 && ftype != 1) {
+                    fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                    return false;
+                }
+
+                if (ftype == 1) {
+                    data_f16.resize(nelements);
+                    finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                    data_f32.resize(nelements);
+                    for (int i = 0; i < nelements; ++i) {
+                        data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                    }
+                } else {
+                    data_f32.resize(nelements);
+                    finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+                }
+
+                ftype = itype;
+            } else {
+                const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+                data_u8.resize(nelements*bpe);
+                finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
+            }
+
+            fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fout.write(reinterpret_cast<char *>(&length), sizeof(length));
+            fout.write(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            for (int i = 0; i < n_dims; ++i) {
+                fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            }
+            fout.write(&name[0], length);
+
+            if (quantize) {
+                printf("quantizing .. ");
+                work.resize(nelements); // for quantization
+
+                size_t cur_size = 0;
+                std::vector<int64_t> hist_cur(1 << 4, 0);
+
+                switch (type) {
+                    case GGML_TYPE_Q4_0:
+                        {
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q4_1:
+                        {
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+                        } break;
+                    default:
+                        {
+                            fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
+                            return false;
+                        }
+                }
+
+                fout.write(reinterpret_cast<char *>(work.data()), cur_size);
+                total_size_new += cur_size;
+
+                printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    hist_all[i] += hist_cur[i];
+                }
+
+                for (int i = 0; i < hist_cur.size(); ++i) {
+                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                }
+                printf("\n");
+            } else {
+                printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+                total_size_new += data_u8.size();
+            }
+
+            total_size_org += nelements * sizeof(float);
+        }
+
+        printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+        printf("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+        {
+            int64_t sum_all = 0;
+            for (int i = 0; i < hist_all.size(); ++i) {
+                sum_all += hist_all[i];
+            }
+
+            printf("%s: hist: ", __func__);
+            for (int i = 0; i < hist_all.size(); ++i) {
+                printf("%5.3f ", hist_all[i] / (float)sum_all);
+            }
+            printf("\n");
+        }
+    }
+
+    finp.close();
+    fout.close();
+
+    return true;
+}
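
For intuition about the "size = %8.2f MB -> %8.2f MB" lines the loop above prints: a back-of-envelope size check, assuming the Q4_0/Q4_1 block layout of this era of ggml (one fp32 scale per QK = 32 weights for Q4_0, plus an additional fp32 minimum per block for Q4_1 — an assumption about the format, not something stated in this diff):

    // Rough size estimate for one quantized tensor.
    // ASSUMES the block layout described above and nelements % QK == 0.
    #include <cstddef>
    #include <cstdio>

    static const int QK = 32;

    static size_t q4_0_size(size_t nelements) {
        // fp32 scale + QK/2 bytes of packed 4-bit values per block of QK weights
        return (nelements / QK) * (sizeof(float) + QK / 2);
    }

    static size_t q4_1_size(size_t nelements) {
        // scale + min (two floats) + QK/2 bytes of packed 4-bit values per block
        return (nelements / QK) * (2 * sizeof(float) + QK / 2);
    }

    int main() {
        const size_t n = 4096 * 4096; // a typical 2D weight tensor
        std::printf("f32 : %8.2f MB\n", n * sizeof(float) / 1024.0 / 1024.0);
        std::printf("q4_0: %8.2f MB\n", q4_0_size(n) / 1024.0 / 1024.0);
        std::printf("q4_1: %8.2f MB\n", q4_1_size(n) / 1024.0 / 1024.0);
        return 0;
    }

Under these assumptions a 4096×4096 f32 tensor shrinks from 64 MB to 10 MB (Q4_0) or 12 MB (Q4_1), which is the kind of ratio the log lines report.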

llama.h

Lines changed: 1 addition & 0 deletions

@@ -64,3 +64,4 @@ bool llama_eval(
         const std::vector<gpt_vocab::id> & embd_inp,
               std::vector<float>         & embd_w,
               size_t                     & mem_per_token);
+bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype);
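
The fourth changed file, quantize.cpp, is not shown in this view. After this commit its main() can reduce to argument parsing plus a call into the new entry point, roughly like this (a minimal sketch of a caller, not the actual quantize.cpp from the commit):

    // Hypothetical driver mirroring what quantize.cpp can now look like.
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc != 4) {
            std::fprintf(stderr, "usage: %s model-f16.bin model-quant.bin type\n", argv[0]);
            std::fprintf(stderr, "  type = 2 - q4_0\n");
            std::fprintf(stderr, "  type = 3 - q4_1\n");
            return 1;
        }

        const std::string fname_inp = argv[1];
        const std::string fname_out = argv[2];
        const int itype = std::atoi(argv[3]);

        // itype values 2 and 3 map to GGML_TYPE_Q4_0 and GGML_TYPE_Q4_1
        // inside llama_model_quantize(); anything else is rejected there
        if (!llama_model_quantize(fname_inp, fname_out, itype)) {
            std::fprintf(stderr, "failed to quantize model from '%s'\n", fname_inp.c_str());
            return 1;
        }

        return 0;
    }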
