@@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
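The change above adds "num_layers" to the list of recognized layer-count keys, since T5-style config.json files use that name rather than "num_hidden_layers" or "n_layer". Below is a minimal sketch of this kind of first-match lookup; it only illustrates the idea and is not the script's exact find_hparam implementation, and the config values are illustrative.

# Illustrative first-match lookup over alternative hyperparameter names (not the actual find_hparam).
def find_hparam(hparams: dict, keys: list[str]):
    for key in keys:
        if key in hparams:
            return hparams[key]
    raise KeyError(f"could not find any of: {keys}")

# T5-style configs store the layer count under "num_layers" (values here are made up):
t5_style_config = {"num_layers": 6, "d_model": 512}
llama_style_config = {"num_hidden_layers": 32, "hidden_size": 4096}

assert find_hparam(t5_style_config, ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) == 6
assert find_hparam(llama_style_config, ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) == 32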
@@ -2771,6 +2771,124 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5-based and Flan-T5-based models sometimes contain an "encoder.embed_tokens.weight" or
+        # "decoder.embed_tokens.weight" tensor that is a duplicate of the "shared.weight" tensor.
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 
 ###### CONVERSION LOGIC ######
 
 
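set_vocab above reads the SentencePiece model proto to recover the normalizer settings (add_dummy_prefix, remove_extra_whitespaces, precompiled_charsmap) and to confirm the tokenizer uses the UNIGRAM model type. A small standalone sketch along these lines can inspect a spiece.model before running the converter; the script name and the printed output here are illustrative, not part of the patch.

# inspect_spiece.py - print the normalizer settings the converter reads (illustrative sketch)
import os
import sys

# same workaround as in set_vocab: force the pure-Python protobuf implementation
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import sentencepiece_model_pb2 as model

proto = model.ModelProto()
with open(sys.argv[1], "rb") as f:  # e.g. path/to/spiece.model
    proto.ParseFromString(f.read())

print("model_type:", proto.trainer_spec.model_type)  # 1 == UNIGRAM
print("add_dummy_prefix:", proto.normalizer_spec.add_dummy_prefix)
print("remove_extra_whitespaces:", proto.normalizer_spec.remove_extra_whitespaces)
print("precompiled_charsmap bytes:", len(proto.normalizer_spec.precompiled_charsmap))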
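modify_tensors drops encoder.embed_tokens.weight and decoder.embed_tokens.weight because, when present, they duplicate shared.weight. A quick check like the following can confirm that for a given checkpoint before conversion; the file name and the use of safetensors/torch are assumptions about how the checkpoint is stored, not part of the patch.

# check_shared_embeddings.py - verify the embed_tokens tensors duplicate shared.weight (illustrative sketch)
import sys

import torch
from safetensors.torch import load_file

state_dict = load_file(sys.argv[1])  # e.g. path/to/model.safetensors (assumed single-file checkpoint)

shared = state_dict["shared.weight"]
for name in ("encoder.embed_tokens.weight", "decoder.embed_tokens.weight"):
    if name in state_dict:
        same = torch.equal(shared, state_dict[name])
        print(f"{name}: present, identical to shared.weight = {same}")
    else:
        print(f"{name}: not present in this checkpoint")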