
Commit 2dab8f5

Add enum llama_ftype, sync ggml_type to model files

1 parent 3416298, commit 2dab8f5

File tree

5 files changed: 91 additions, 78 deletions

examples/quantize/quantize.cpp

Lines changed: 12 additions & 4 deletions
@@ -12,8 +12,8 @@ int main(int argc, char ** argv) {
 
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
+        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         return 1;
     }
 
@@ -27,7 +27,15 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int itype = atoi(argv[3]);
+    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0:
+        case LLAMA_FTYPE_MOSTLY_Q4_1:
+            break;
+        default:
+            fprintf(stderr, "Invalid model file type %d\n", ftype);
+            return 1;
+    }
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -37,7 +45,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }

ggml.c

Lines changed: 16 additions & 19 deletions
@@ -2578,29 +2578,26 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
 //
 
 static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    QK,
-    QK,
-    1,
-    1,
-    1,
-    1,
-    1,
+    [GGML_TYPE_F32]  = 1,
+    [GGML_TYPE_F16]  = 1,
+    [GGML_TYPE_Q4_0] = QK,
+    [GGML_TYPE_Q4_1] = QK,
+    [GGML_TYPE_I8]   = 1,
+    [GGML_TYPE_I16]  = 1,
+    [GGML_TYPE_I32]  = 1,
 };
-
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    sizeof(block_q4_0),
-    sizeof(block_q4_1),
-    sizeof(int8_t ),
-    sizeof(int16_t),
-    sizeof(int32_t),
-    sizeof(ggml_fp16_t),
-    sizeof(float ),
+    [GGML_TYPE_F32]  = sizeof(float),
+    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
+    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
+    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
+    [GGML_TYPE_I8]   = sizeof(int8_t),
+    [GGML_TYPE_I16]  = sizeof(int16_t),
+    [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-
-// don't forget to update the array above when adding new types
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",

ggml.h

Lines changed: 5 additions & 4 deletions
@@ -198,13 +198,14 @@ struct ggml_object;
 struct ggml_context;
 
 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32 = 0,
+    GGML_TYPE_F16 = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
     GGML_TYPE_COUNT,
 };
 
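This reordering matters for the file format: before the change GGML_TYPE_Q4_0 was 0 and GGML_TYPE_F32 was 6, so the enum values did not match the tensor-type integers already stored in llama model files (0 = f32, 1 = f16, 2 = q4_0, 3 = q4_1). Pinning the first four members to those numbers is what lets llama.cpp compare the on-disk value directly against GGML_TYPE_*. A hypothetical compile-time guard, not part of this commit, that would catch an accidental renumbering later:

    // Hypothetical guard (not in this commit): these values are written to and
    // read from model files, so they must never change.
    static_assert(GGML_TYPE_F32  == 0 && GGML_TYPE_F16  == 1 &&
                  GGML_TYPE_Q4_0 == 2 && GGML_TYPE_Q4_1 == 3,
                  "on-disk tensor type values must stay stable");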

llama.cpp

Lines changed: 48 additions & 50 deletions
@@ -36,6 +36,7 @@
         } \
     } while (0)
 
+static const char * ttype_str[] = { "f32", "f16", "q4_0", "q4_1" };
 
 // determine number of model parts based on the dimension
 static const std::unordered_map<int, int> LLAMA_N_PARTS = {
@@ -100,7 +101,7 @@ struct llama_hparams {
     int32_t n_head  = 32;
     int32_t n_layer = 32;
     int32_t n_rot   = 64;
-    int32_t f16     = 1;
+    int32_t ftype   = LLAMA_FTYPE_MOSTLY_F16;
 };
 
 struct llama_layer {
@@ -424,7 +425,7 @@ static bool llama_model_load(
         fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
         hparams.n_ctx = n_ctx;
 
@@ -435,7 +436,7 @@ static bool llama_model_load(
     }
 
     // temp warning to tell the user to use "--n_parts"
-    if (hparams.f16 == 4 && n_parts != 1) {
+    if (hparams.ftype == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
         fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
         fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
     }
@@ -463,7 +464,7 @@ static bool llama_model_load(
     fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
     fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
     fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
-    fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
+    fprintf(stderr, "%s: ftype   = %d\n", __func__, hparams.ftype);
     fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
     fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
     fprintf(stderr, "%s: type    = %d\n", __func__, model.type);
@@ -507,16 +508,19 @@ static bool llama_model_load(
     // in order to save memory and also to speed up the computation
     // wtype is for per-layer weights, while vtype is for other weights
     ggml_type wtype, vtype;
-    switch (model.hparams.f16) {
-        case 0: wtype = vtype = GGML_TYPE_F32;  break;
-        case 1: wtype = vtype = GGML_TYPE_F16;  break;
-        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
+    switch (model.hparams.ftype) {
+        case LLAMA_FTYPE_ALL_F32:     wtype = vtype = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_F16:  wtype = vtype = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
+            wtype = GGML_TYPE_Q4_1;
+            vtype = GGML_TYPE_F16;
+            break;
         default:
             {
-                fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                        __func__, fname.c_str(), model.hparams.f16);
+                fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                        __func__, fname.c_str(), model.hparams.ftype);
                 return false;
             }
     }
@@ -647,11 +651,11 @@ static bool llama_model_load(
         while (true) {
             int32_t n_dims;
             int32_t length;
-            int32_t ftype;
+            int32_t ttype;
 
             fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
             fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
 
             if (fin.eof()) {
                 break;
@@ -684,20 +688,19 @@ static bool llama_model_load(
                 return false;
             }
             if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ttype_str[ttype]);
             }
 
-            switch (ftype) {
-                case 0: // f32
-                case 1: // f16
+            switch (ttype) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
                     break;
-                case 2: // q4_0
-                case 3: // q4_1
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
                     assert(ne[0] % 64 == 0);
                     break;
                 default:
-                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                    fprintf(stderr, "%s: unknown tensor type %d in model file\n", __func__, ttype);
                     return false;
             };
 
@@ -1289,20 +1292,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
 //
 
 // TODO: reuse code from the llama_model_load() somehow
-static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
+static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+    ggml_type qtype;
 
-    switch (itype) {
-        case 2: type = GGML_TYPE_Q4_0; break;
-        case 3: type = GGML_TYPE_Q4_1; break;
-        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, ftype); return false;
     };
 
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
-        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
-        return false;
-    }
-
     llama_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1357,15 +1355,15 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-       finp.read((char *) &hparams.f16,     sizeof(hparams.f16));
+       finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
        printf("%s: n_mult  = %d\n", __func__, hparams.n_mult);
        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-       printf("%s: f16     = %d\n", __func__, hparams.f16);
+       printf("%s: ftype   = %d\n", __func__, hparams.ftype);
 
        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -1374,7 +1372,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-       fout.write((char *) &itype,           sizeof(hparams.f16));
+       int32_t iftype = ftype;
+       fout.write((char *) &iftype,          sizeof(hparams.ftype));
     }
 
     // load vocab
@@ -1426,11 +1425,11 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
     while (true) {
         int32_t n_dims;
         int32_t length;
-        int32_t ftype;
+        int32_t ttype;
 
         finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
         finp.read(reinterpret_cast<char *>(&length), sizeof(length));
-        finp.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
 
         if (finp.eof()) {
             break;
@@ -1454,8 +1453,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
         {
-            static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-            printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
+            printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ttype_str[ttype]);
         }
 
         // regexes of tensor names to be quantized
@@ -1475,12 +1473,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (n_dims == 2);
 
         if (quantize) {
-            if (ftype != 0 && ftype != 1) {
-                fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+                fprintf(stderr, "%s: unsupported tensor type %d for integer quantization\n", __func__, ttype);
                 return false;
             }
 
-            if (ftype == 1) {
+            if (ttype == GGML_TYPE_F16) {
                 data_f16.resize(nelements);
                 finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                 data_f32.resize(nelements);
@@ -1492,17 +1490,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                 finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
             }
 
-            ftype = itype;
+            ttype = qtype;
         } else {
-            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+            const int bpe = ggml_type_size((ggml_type)ttype);
 
             data_u8.resize(nelements*bpe);
             finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
         }
 
         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
         fout.write(reinterpret_cast<char *>(&length), sizeof(length));
-        fout.write(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
         for (int i = 0; i < n_dims; ++i) {
             fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
         }
@@ -1522,7 +1520,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t cur_size = 0;
                 std::vector<int64_t> hist_cur(1 << 4, 0);
 
-                switch (type) {
+                switch (qtype) {
                     case GGML_TYPE_Q4_0:
                         {
                             cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
@@ -1533,7 +1531,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                         } break;
                     default:
                         {
-                            fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
+                            fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, qtype);
                             return false;
                         }
                 }
@@ -1675,8 +1673,8 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        int   itype) {
-    if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
+        enum llama_ftype   ftype) {
+    if (!llama_model_quantize_internal(fname_inp, fname_out, ftype)) {
         fprintf(stderr, "%s: failed to quantize\n", __func__);
         return 1;
     }
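
Read together, the loader changes define one mapping from llama_ftype to the two tensor types the loader uses (wtype for per-layer weights, vtype for everything else). A hypothetical helper, not part of this commit, that captures that mapping in one place:

    // Hypothetical helper (not in this commit); mirrors the switch in llama_model_load().
    static bool llama_ftype_to_ggml_type(enum llama_ftype ftype, ggml_type & wtype, ggml_type & vtype) {
        switch (ftype) {
            case LLAMA_FTYPE_ALL_F32:           wtype = vtype = GGML_TYPE_F32;  return true;
            case LLAMA_FTYPE_MOSTLY_F16:        wtype = vtype = GGML_TYPE_F16;  return true;
            case LLAMA_FTYPE_MOSTLY_Q4_0:       wtype = vtype = GGML_TYPE_Q4_0; return true;
            case LLAMA_FTYPE_MOSTLY_Q4_1:       wtype = vtype = GGML_TYPE_Q4_1; return true;
            case LLAMA_FTYPE_PER_LAYER_IS_Q4_1: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; return true;
            default:                            return false; // unknown model file type
        }
    }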

llama.h

Lines changed: 10 additions & 1 deletion
@@ -64,6 +64,15 @@ extern "C" {
         void * progress_callback_user_data;
     };
 
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // but tok_embeddings.weight and output.weight are F16
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
 
     // Various functions for loading a ggml llama model.
@@ -81,7 +90,7 @@ extern "C" {
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            int   itype);
+            enum llama_ftype   ftype);
 
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
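
With the enum exported here, callers name the target file type instead of passing a magic integer. A minimal sketch of the updated API in use; the model paths are placeholders, and per the llama.cpp change above a non-zero return indicates failure:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        // Quantize an f16 model to q4_0 (paths are placeholders).
        if (llama_model_quantize("models/7B/ggml-model-f16.bin",
                                 "models/7B/ggml-model-q4_0.bin",
                                 LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }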
