@@ -3586,141 +3586,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     return true;
 }
 
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-    assert(itype < GGML_TYPE_COUNT);
-    ggml_type type = static_cast<ggml_type>(itype);
-
-    auto * ctx_clip = clip_init(fname_inp, clip_context_params{
-        /* use_gpu */   false,
-        /* verbosity */ GGML_LOG_LEVEL_ERROR,
-    });
-
-    const auto & ctx_src  = ctx_clip->ctx_gguf.get();
-    const auto & ctx_data = ctx_clip->ctx_data.get();
-
-    auto * ctx_out = gguf_init_empty();
-    gguf_set_kv(ctx_out, ctx_src);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", itype);
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-
-    const int n_tensors = gguf_get_n_tensors(ctx_src);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(ctx_src, i);
-        ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-        gguf_add_tensor(ctx_out, cur);
-    }
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
-    for (size_t i = 0; i < meta_size; ++i) {
-        fout.put(0);
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> k_names = {
-        ".*weight",
-    };
-
-    std::vector<uint8_t> work(512);
-    std::vector<float> conv_buf(512);
-    size_t total_size_org = 0;
-    size_t total_size_new = 0;
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const std::string name = gguf_get_tensor_name(ctx_src, i);
-        ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
-
-        enum ggml_type new_type;
-        void * new_data;
-        size_t new_size;
-
-        bool quantize = false;
-        for (const auto & s : k_names) {
-            if (std::regex_match(name, std::regex(s))) {
-                quantize = true;
-                break;
-            }
-        }
-
-        // quantize only 2D tensors and bigger than block size
-        quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
-
-        if (quantize) {
-            new_type = type;
-            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
-                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
-            }
-            const size_t n_elms = ggml_nelements(cur);
-            float * f32_data;
-
-            switch (cur->type) {
-                case GGML_TYPE_F32:
-                    f32_data = (float *)cur->data;
-                    break;
-                case GGML_TYPE_F16:
-                    if (conv_buf.size() < n_elms) {
-                        conv_buf.resize(n_elms);
-                    }
-                    for (size_t j = 0; j < n_elms; ++j) {
-                        conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
-                    }
-                    f32_data = (float *)conv_buf.data();
-                    break;
-                default:
-                    LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
-                    gguf_free(ctx_out);
-                    return false;
-            }
-
-            if (work.size() < n_elms * 4) {
-                work.resize(n_elms * 4);
-            }
-            new_data = work.data();
-
-            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
-        } else {
-            new_type = cur->type;
-            new_data = cur->data;
-            new_size = ggml_nbytes(cur);
-        }
-        const size_t orig_size = ggml_nbytes(cur);
-        total_size_org += orig_size;
-        total_size_new += new_size;
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
-        fout.write((const char *)new_data, new_size);
-        size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
-        for (size_t j = 0; j < pad; ++j) {
-            fout.put(0);
-        }
-
-        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
-                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
-    }
-
-    // go back to beginning of file and write the updated metadata
-    fout.seekp(0, std::ios::beg);
-    std::vector<uint8_t> meta(meta_size);
-    gguf_get_meta_data(ctx_out, meta.data());
-    fout.write((const char *)meta.data(), meta_size);
-
-    fout.close();
-
-    clip_free(ctx_clip);
-    gguf_free(ctx_out);
-
-    {
-        LOG_INF("%s: original size  = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
-    }
-
-    return true;
-}
-
 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     switch (ctx->proj_type) {
         case PROJECTOR_TYPE_LDP: