@@ -77,7 +77,7 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
-    uint32_t f16     = 1;
+    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
         return memcmp(this, &other, sizeof(llama_hparams));
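The `enum llama_ftype` used here is declared in llama.h and is not part of this diff. Judging by the values the loader reads back and the names used in `llama_ftype_name()` further down, it presumably looks roughly like the sketch below; the explicit numbering mirrors the old `hparams.f16` codes (0 = F32, 1 = F16, 2 = Q4_0, 3 = Q4_1), so existing model headers keep their meaning.

```cpp
// Sketch of the assumed llama.h declaration (not shown in this diff).
// Values are chosen to match the old uint32_t hparams.f16 field so the
// on-disk header stays byte-compatible.
enum llama_ftype {
    LLAMA_FTYPE_ALL_F32     = 0,
    LLAMA_FTYPE_MOSTLY_F16  = 1,
    LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
    LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
};
```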
@@ -427,7 +427,7 @@ struct llama_file_loader {
         hparams.n_head  = file.read_u32();
         hparams.n_layer = file.read_u32();
         hparams.n_rot   = file.read_u32();
-        hparams.f16     = file.read_u32();
+        hparams.ftype   = (enum llama_ftype) file.read_u32();
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -453,20 +453,21 @@ struct llama_file_loader {
         llama_load_tensor_shard shard;
         uint32_t n_dims = file.read_u32();
         uint32_t name_len = file.read_u32();
-        uint32_t ftype = file.read_u32();
+        shard.type = (enum ggml_type) file.read_u32();
         shard.ne.resize(n_dims);
         file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
         std::string name = file.read_string(name_len);
         if (n_dims < 1 || n_dims > 2) {
             throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
         }
-        switch (ftype) {
-            case 0: shard.type = GGML_TYPE_F32;  break;
-            case 1: shard.type = GGML_TYPE_F16;  break;
-            case 2: shard.type = GGML_TYPE_Q4_0; break;
-            case 3: shard.type = GGML_TYPE_Q4_1; break;
+        switch (shard.type) {
+            case GGML_TYPE_F32:
+            case GGML_TYPE_F16:
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+                break;
             default: {
-                throw format("unrecognized ftype %u\n", ftype);
+                throw format("unrecognized tensor type %u\n", shard.type);
             }
         }
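The read path above now casts the stored per-tensor type code straight to `enum ggml_type` instead of translating through an explicit 0–3 table, and the matching write path below emits `new_type` verbatim. That only stays file-compatible if the first four `ggml_type` enumerators carry the same values the old table used; a compile-time guard along these lines (hypothetical, not part of the diff) would state that assumption explicitly.

```cpp
// Hypothetical compile-time guard: the direct (enum ggml_type) cast in
// read_tensor_metadata() and the raw write_u32(new_type) in write_tensor()
// assume these enumerator values match the legacy per-tensor type codes.
static_assert(GGML_TYPE_F32  == 0, "file format expects F32  == 0");
static_assert(GGML_TYPE_F16  == 1, "file format expects F16  == 1");
static_assert(GGML_TYPE_Q4_0 == 2, "file format expects Q4_0 == 2");
static_assert(GGML_TYPE_Q4_1 == 3, "file format expects Q4_1 == 3");
```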
@@ -497,26 +498,26 @@ struct llama_file_loader {
 struct llama_file_saver {
     llama_file file;
     llama_file_loader * any_file_loader;
-    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16)
+    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
         : file(fname, "wb"), any_file_loader(any_file_loader) {
         fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
         write_magic();
-        write_hparams(new_f16);
+        write_hparams(new_ftype);
         write_vocab();
     }
     void write_magic() {
         file.write_u32('ggjt'); // magic
         file.write_u32(1); // version
     }
-    void write_hparams(uint32_t new_f16) {
+    void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
         file.write_u32(hparams.n_embd);
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
         file.write_u32(hparams.n_rot);
-        file.write_u32(new_f16);
+        file.write_u32(new_ftype);
     }
     void write_vocab() {
         if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
@@ -531,17 +532,17 @@ struct llama_file_saver {
         }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        uint32_t ftype;
         switch (new_type) {
-            case GGML_TYPE_F32:  ftype = 0; break;
-            case GGML_TYPE_F16:  ftype = 1; break;
-            case GGML_TYPE_Q4_0: ftype = 2; break;
-            case GGML_TYPE_Q4_1: ftype = 3; break;
+            case GGML_TYPE_F32:
+            case GGML_TYPE_F16:
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+                break;
             default: LLAMA_ASSERT(false);
         }
         file.write_u32((uint32_t) tensor.ne.size());
         file.write_u32((uint32_t) tensor.name.size());
-        file.write_u32(ftype);
+        file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
         file.seek(-file.tell() & 31, SEEK_CUR);
@@ -815,6 +816,16 @@ static const char *llama_file_version_name(llama_file_version version) {
     }
 }
 
+static const char *llama_ftype_name(enum llama_ftype ftype) {
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:     return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+        default: LLAMA_ASSERT(false);
+    }
+}
+
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_7B: return "7B";
@@ -867,7 +878,7 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
     fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
     fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
-    fprintf(stderr, "%s: f16        = %u\n",  __func__, hparams.f16);
+    fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
     fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
     fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
     fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
@@ -1539,17 +1550,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
     ggml_type quantized_type;
-    switch (itype) {
-        case 2: quantized_type = GGML_TYPE_Q4_0; break;
-        case 3: quantized_type = GGML_TYPE_Q4_1; break;
-        default: throw format("invalid quantization type %d\n", itype);
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        default: throw format("invalid output file type %d\n", ftype);
     };
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -1740,9 +1751,9 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        int itype) {
+        enum llama_ftype ftype) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, itype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
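With the last hunk, the public quantization entry point takes the file-type enum directly instead of a magic integer. A minimal caller might look like the sketch below; the model paths are hypothetical, and it only relies on the 0-on-success return convention visible in the diff.

```cpp
#include "llama.h"
#include <cstdio>

int main() {
    // Hypothetical input/output paths for illustration only.
    const char * fname_inp = "models/7B/ggml-model-f16.bin";
    const char * fname_out = "models/7B/ggml-model-q4_0.bin";

    // Request 4-bit (Q4_0) output via the new enum instead of the old itype = 2.
    if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```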