
Commit e24187b

fairydreaming authored and sszymczy committed
gguf-py, convert-hf : model conversion support for T5 and FLAN-T5 model variants (ggml-org#5763)

* gguf-py : add T5 model architecture
* gguf-py : add separate tensors for encoder and decoder
* gguf-py : add new model header parameters: decoder_start_token_id, attention.relative_buckets_count, tokenizer.ggml.remove_extra_whitespaces, tokenizer.ggml.precompiled_charsmap
* convert-hf : add model conversion support for T5ForConditionalGeneration and T5WithLMHeadModel

---------

Co-authored-by: Stanisław Szymczyk <[email protected]>
1 parent 64be20c commit e24187b
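The four new header parameters named in the commit message correspond to `GGUFWriter` methods that the diff below calls from the converter. As a minimal, hedged sketch of emitting them in isolation (the output file name and all values here are illustrative assumptions, not taken from the commit):

```python
# Minimal sketch, not the converter itself: emit the header parameters this
# commit adds, using the GGUFWriter methods that appear in the diff below.
# The file name and values are assumptions for illustration.
import gguf

writer = gguf.GGUFWriter("t5-sketch.gguf", arch="t5")
writer.add_decoder_start_token_id(0)        # t5.decoder_start_token_id
writer.add_relative_attn_buckets_count(32)  # t5.attention.relative_buckets_count
writer.add_remove_extra_whitespaces(True)   # tokenizer.ggml.remove_extra_whitespaces
writer.add_precompiled_charsmap(b"")        # tokenizer.ggml.precompiled_charsmap (raw bytes)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```

The key names in the comments follow the commit message, with the first two prefixed by the architecture name as is conventional for GGUF model headers.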

File tree: 4 files changed, +506 −164 lines changed


convert-hf-to-gguf.py

119 additions, 1 deletion
@@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
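This one-line change extends the hyperparameter lookup so T5 configs, which store the layer count under `num_layers`, resolve correctly: `find_hparam` returns the value of the first listed key present in the loaded config. A standalone, dict-based stand-in for that lookup (not the `Model` method itself, which also accepts an `optional` flag):

```python
# Stand-in sketch of the first-match key lookup find_hparam performs
# over the loaded Hugging Face config dict.
def find_hparam(hparams: dict, keys: list[str]):
    for key in keys:
        if key in hparams:
            return hparams[key]
    raise KeyError(f"could not find any of: {keys}")

# T5 configs name the layer count "num_layers", which the old key list missed:
t5_hparams = {"num_layers": 6, "d_model": 512}  # illustrative values
assert find_hparam(t5_hparams, ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) == 6
```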
@@ -2771,6 +2771,124 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
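`set_vocab` above classifies every sentencepiece token by probing the processor with `IsUnknown`/`IsControl`/`IsUnused`/`IsByte`. A standalone sketch of that classification, assuming a local checkpoint directory containing a `spiece.model` file (the path is an assumption):

```python
# Hedged, standalone sketch of the token classification set_vocab performs;
# the labels mirror the SentencePieceTokenTypes cases in the diff.
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.LoadFromFile("t5-small/spiece.model")  # assumed local checkpoint path

for token_id in range(10):
    if sp.IsUnknown(token_id):
        kind = "UNKNOWN"
    elif sp.IsControl(token_id):
        kind = "CONTROL"
    elif sp.IsUnused(token_id):
        kind = "UNUSED"
    elif sp.IsByte(token_id):
        kind = "BYTE"
    else:
        kind = "NORMAL"
    print(token_id, sp.IdToPiece(token_id), sp.GetScore(token_id), kind)
```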
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
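`set_gguf_parameters` reads standard Hugging Face T5 config keys. As a hedged reference, a config snippet with the keys the method consumes, with values in the ballpark of a small T5 checkpoint (assumptions for illustration, not read from any particular file):

```python
# Illustrative HF-style T5 config dict showing the keys set_gguf_parameters
# consumes; values are assumptions roughly matching a small T5 model.
hparams = {
    "n_positions": 512,                    # -> context_length
    "d_model": 512,                        # -> embedding_length
    "d_ff": 2048,                          # -> feed_forward_length
    "num_layers": 6,                       # -> block_count
    "num_heads": 8,                        # -> head_count
    "d_kv": 64,                            # -> key_length / value_length
    "layer_norm_epsilon": 1e-6,            # -> layer_norm_eps and layer_norm_rms_eps
    "relative_attention_num_buckets": 32,  # -> attention.relative_buckets_count
    "decoder_start_token_id": 0,           # -> decoder_start_token_id
}
```

Note that the diff writes the same epsilon to both the plain and RMS layer-norm keys; T5's normalization is RMS-style, so a consumer reading either key sees a value.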
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######


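To spot-check a converted file, the new metadata can be read back with gguf-py's `GGUFReader`. A hedged sketch (the output file name is an assumption, and the scalar unpacking follows the `ReaderField` layout used by gguf-py's own dump script):

```python
# Hedged sketch: inspect the header parameters this commit adds in a
# converted GGUF file. The file name is an assumption.
from gguf import GGUFReader

reader = GGUFReader("t5-small.gguf")
for key in ("t5.decoder_start_token_id",
            "t5.attention.relative_buckets_count",
            "tokenizer.ggml.remove_extra_whitespaces"):
    field = reader.fields.get(key)
    if field is not None:
        # for scalar fields the last part holds the value
        print(key, "=", field.parts[-1][0])
```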