        } \
    } while (0)

+ static const char * ttype_str[] = { "f32", "f16", "q4_0", "q4_1" };

// determine number of model parts based on the dimension
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
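Note: the new ttype_str table is indexed directly with the raw tensor-type value read from the model file (see the switch on ttype in the hunks below), so it relies on ggml_type numbering f32, f16, q4_0 and q4_1 as 0 through 3. A minimal sketch of that assumption, written as compile-time checks; this is not part of the diff, and ggml.h is the authoritative source:

// Sketch only: makes the implicit assumption behind ttype_str[ttype] explicit.
// Requires ggml.h, which llama.cpp already includes.
static_assert(GGML_TYPE_F32  == 0, "ttype_str[0] is \"f32\"");
static_assert(GGML_TYPE_F16  == 1, "ttype_str[1] is \"f16\"");
static_assert(GGML_TYPE_Q4_0 == 2, "ttype_str[2] is \"q4_0\"");
static_assert(GGML_TYPE_Q4_1 == 3, "ttype_str[3] is \"q4_1\"");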
@@ -100,7 +101,7 @@ struct llama_hparams {
    int32_t n_head  = 32;
    int32_t n_layer = 32;
    int32_t n_rot   = 64;
-   int32_t f16     = 1;
+   int32_t ftype   = LLAMA_FTYPE_MOSTLY_F16;
};

struct llama_layer {
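Note: llama_ftype itself is declared in llama.h and is not shown in this diff. Judging from the one-for-one replacement of the old integer codes in the switch further down, its values would look roughly like this; a sketch under that assumption, not the authoritative definition:

// Sketch of the llama_ftype values this change assumes; llama.h is authoritative.
// The numbers mirror the old hparams.f16 codes so existing model files keep their meaning.
enum llama_ftype {
    LLAMA_FTYPE_ALL_F32           = 0,
    LLAMA_FTYPE_MOSTLY_F16        = 1,
    LLAMA_FTYPE_MOSTLY_Q4_0       = 2,
    LLAMA_FTYPE_MOSTLY_Q4_1       = 3,
    LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // per-layer weights q4_1, the rest f16 (GPTQ-style)
};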
@@ -424,7 +425,7 @@ static bool llama_model_load(
        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-       fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+       fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

        hparams.n_ctx = n_ctx;

@@ -435,7 +436,7 @@ static bool llama_model_load(
    }

    // temp warning to tell the user to use "--n_parts"
-   if (hparams.f16 == 4 && n_parts != 1) {
+   if (hparams.ftype == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
        fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
        fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
    }
@@ -463,7 +464,7 @@ static bool llama_model_load(
        fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
-       fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
+       fprintf(stderr, "%s: ftype   = %d\n", __func__, hparams.ftype);
        fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
        fprintf(stderr, "%s: type    = %d\n", __func__, model.type);
@@ -507,16 +508,19 @@ static bool llama_model_load(
    // in order to save memory and also to speed up the computation
    // wtype is for per-layer weights, while vtype is for other weights
    ggml_type wtype, vtype;
-   switch (model.hparams.f16) {
-       case 0: wtype = vtype = GGML_TYPE_F32;  break;
-       case 1: wtype = vtype = GGML_TYPE_F16;  break;
-       case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-       case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-       case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
+   switch (model.hparams.ftype) {
+       case LLAMA_FTYPE_ALL_F32:     wtype = vtype = GGML_TYPE_F32;  break;
+       case LLAMA_FTYPE_MOSTLY_F16:  wtype = vtype = GGML_TYPE_F16;  break;
+       case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
+       case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
+       case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
+           wtype = GGML_TYPE_Q4_1;
+           vtype = GGML_TYPE_F16;
+           break;
        default:
            {
-               fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                       __func__, fname.c_str(), model.hparams.f16);
+               fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                       __func__, fname.c_str(), model.hparams.ftype);
                return false;
            }
    }
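Note: wtype and vtype are consumed further down in llama_model_load when the model tensors are created (not part of this diff). Roughly, per-layer weights get wtype and the remaining weights get vtype, which is what makes the GPTQ-style LLAMA_FTYPE_PER_LAYER_IS_Q4_1 split work; a sketch under that assumption:

// Sketch only: illustrates how the two types are typically used when allocating tensors.
// ctx, n_embd, n_vocab, n_layer and model come from the surrounding llama_model_load code.
model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); // "other" weight
model.output         = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); // "other" weight

for (int i = 0; i < n_layer; ++i) {
    auto & layer = model.layers[i];
    layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); // per-layer weight
}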
@@ -647,11 +651,11 @@ static bool llama_model_load(
        while (true) {
            int32_t n_dims;
            int32_t length;
-           int32_t ftype;
+           int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-           fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+           fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
@@ -684,20 +688,19 @@ static bool llama_model_load(
                return false;
            }
            if (0) {
-               static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-               fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+               fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ttype_str[ttype]);
            }

-           switch (ftype) {
-               case 0: // f32
-               case 1: // f16
+           switch (ttype) {
+               case GGML_TYPE_F32:
+               case GGML_TYPE_F16:
                    break;
-               case 2: // q4_0
-               case 3: // q4_1
+               case GGML_TYPE_Q4_0:
+               case GGML_TYPE_Q4_1:
                    assert(ne[0] % 64 == 0);
                    break;
                default:
-                   fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                   fprintf(stderr, "%s: unknown tensor type %d in model file\n", __func__, ttype);
                    return false;
            };

@@ -1289,20 +1292,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
//

// TODO: reuse code from the llama_model_load() somehow
-static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
+static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+    ggml_type qtype;

-    switch (itype) {
-        case 2: type = GGML_TYPE_Q4_0; break;
-        case 3: type = GGML_TYPE_Q4_1; break;
-        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, ftype); return false;
    };

-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
-        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
-        return false;
-    }
-
    llama_vocab vocab;

    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1357,15 +1355,15 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-       finp.read((char *) &hparams.f16,     sizeof(hparams.f16));
+       finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
        printf("%s: n_mult  = %d\n", __func__, hparams.n_mult);
        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-       printf("%s: f16     = %d\n", __func__, hparams.f16);
+       printf("%s: ftype   = %d\n", __func__, hparams.ftype);

        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        // fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -1374,7 +1372,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-       fout.write((char *) &itype,           sizeof(hparams.f16));
+       int32_t iftype = ftype;
+       fout.write((char *) &iftype,          sizeof(hparams.ftype));
    }

    // load vocab
@@ -1426,11 +1425,11 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
        while (true) {
            int32_t n_dims;
            int32_t length;
-           int32_t ftype;
+           int32_t ttype;

            finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            finp.read(reinterpret_cast<char *>(&length), sizeof(length));
-           finp.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+           finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (finp.eof()) {
                break;
@@ -1454,8 +1453,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
            }

            {
-               static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-               printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
+               printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ttype_str[ttype]);
            }

            // regexes of tensor names to be quantized
@@ -1475,12 +1473,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
            quantize &= (n_dims == 2);

            if (quantize) {
-               if (ftype != 0 && ftype != 1) {
-                   fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+               if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+                   fprintf(stderr, "%s: unsupported tensor type %d for integer quantization\n", __func__, ttype);
                    return false;
                }

-               if (ftype == 1) {
+               if (ttype == GGML_TYPE_F16) {
                    data_f16.resize(nelements);
                    finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                    data_f32.resize(nelements);
@@ -1492,17 +1490,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                    finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
                }

-               ftype = itype;
+               ttype = qtype;
            } else {
-               const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+               const int bpe = ggml_type_size((ggml_type) ttype);

                data_u8.resize(nelements*bpe);
                finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
            }

            fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fout.write(reinterpret_cast<char *>(&length), sizeof(length));
-           fout.write(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+           fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
            for (int i = 0; i < n_dims; ++i) {
                fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
            }
@@ -1522,7 +1520,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                size_t cur_size = 0;
                std::vector<int64_t> hist_cur(1 << 4, 0);

-               switch (type) {
+               switch (qtype) {
                    case GGML_TYPE_Q4_0:
                        {
                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
@@ -1533,7 +1531,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                        } break;
                    default:
                        {
-                           fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
+                           fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, qtype);
                            return false;
                        }
                }
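Note: the expected shape of the ggml quantization helpers used above, inferred from their call sites; a sketch only, ggml.h carries the real declarations:

// Assumed prototypes (inferred from the calls in the previous hunk; see ggml.h).
// src: nelements floats, dst: output buffer, n: element count, k: row length
// (ne[0], hence the assert(ne[0] % 64 == 0) in the loader), hist: 1 << 4 buckets.
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);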
@@ -1675,8 +1673,8 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
-       int itype) {
-   if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
+       enum llama_ftype ftype) {
+   if (!llama_model_quantize_internal(fname_inp, fname_out, ftype)) {
        fprintf(stderr, "%s: failed to quantize\n", __func__);
        return 1;
    }
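Note: with this signature change a caller now passes the enum instead of a magic integer. A minimal usage sketch; the file names are illustrative, only the API call mirrors the diff:

#include "llama.h"
#include <cstdio>

int main() {
    // Hypothetical paths; llama_model_quantize() returns 0 on success, non-zero on failure.
    const char * fname_inp = "models/7B/ggml-model-f16.bin";
    const char * fname_out = "models/7B/ggml-model-q4_0.bin";

    if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
        fprintf(stderr, "failed to quantize '%s'\n", fname_inp);
        return 1;
    }
    return 0;
}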