
Commit e07039b

fix tokenizer
1 parent 8e2566a commit e07039b

File tree

1 file changed: +112 -88 lines


convert_hf_to_gguf.py

Lines changed: 112 additions & 88 deletions
@@ -3365,6 +3365,97 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
+    def _xlmroberta_tokenizer_init(self) -> None:
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def _xlmroberta_set_vocab(self) -> None:
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
 
 @ModelBase.register("RobertaModel")
 class RobertaModel(BertModel):
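
Note: the "realign tokens (see HF tokenizer code)" step in the new _xlmroberta_set_vocab follows the layout used by Hugging Face's XLMRobertaTokenizer: SentencePiece reserves ids 0-2 for <unk>, <s>, </s>, while the fairseq-style vocab that XLM-R checkpoints expect puts <s>, <pad>, </s>, <unk> at ids 0-3 and shifts every real piece up by one. The toy sketch below is not part of the commit (the piece names are invented) and only illustrates what the slice does:

    # Toy illustration of the "realign tokens" slice in _xlmroberta_set_vocab.
    # SentencePiece order: <unk>=0, <s>=1, </s>=2, real pieces from id 3.
    # Target (HF/fairseq) order: <s>=0, <pad>=1, </s>=2, <unk>=3, real pieces from id 4.
    spm_pieces = [b"<unk>", b"<s>", b"</s>", b"piece_a", b"piece_b", b"[PAD1]"]  # made-up vocab
    realigned = [b"<s>", b"<pad>", b"</s>", b"<unk>"] + spm_pieces[3:-1]
    assert realigned == [b"<s>", b"<pad>", b"</s>", b"<unk>", b"piece_a", b"piece_b"]
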
@@ -3423,6 +3514,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
 
         super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
 
+        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()
+
         # the HF config claims n_ctx=8192, but it uses RoPE scaling
         self.hparams["n_ctx"] = 2048
 
@@ -3442,6 +3537,11 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         assert self.hparams["rotary_emb_interleaved"] is False
         assert self.hparams["rotary_emb_scale_base"] is None
 
+    def set_vocab(self) -> None:
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()
+
     def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
         # If the tensor is an experts bias tensor, skip it by returning an empty list.
         if "mlp.experts.bias" in name:
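
Note: the _position_offset bookkeeping set up in the constructor above reflects how RoBERTa/XLM-R style position ids are built in Hugging Face (positions start at pad_token_id + 1), so the first pad_token_id + 1 rows of the position-embedding matrix are reserved and the usable context shrinks by that amount. A worked example with the hyperparameter values XLM-R checkpoints typically ship with (assumed here, not taken from this commit):

    # Mirrors _xlmroberta_tokenizer_init with typical XLM-R values (assumed).
    hparams = {"pad_token_id": 1, "max_position_embeddings": 514}
    position_offset = 1 + hparams["pad_token_id"]           # 2
    hparams["max_position_embeddings"] -= position_offset   # 514 - 2 = 512 usable positions
    assert hparams["max_position_embeddings"] == 512
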
@@ -3466,103 +3566,27 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_count(self.hparams["num_experts"])
         self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
 
+    def _is_tokenizer_xlmroberta(self) -> bool:
+        with open(self.dir_model / "tokenizer.json") as f:
+            tokenizer_json = json.load(f)
+        toktyp = tokenizer_json["model"]["type"]
+        if toktyp == "Unigram":
+            return True
+        if toktyp == "WordPiece":
+            return False
+        raise ValueError(f"unknown tokenizer: {toktyp}")
+
 
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
+        self._xlmroberta_tokenizer_init()
 
     def set_vocab(self):
-        # to avoid TypeError: Descriptors cannot be created directly
-        # exception when importing sentencepiece_model_pb2
-        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
-        from sentencepiece import SentencePieceProcessor
-        from sentencepiece import sentencepiece_model_pb2 as model
-
-        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
-        if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
-
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
-
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
-
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
-
-        self.gguf_writer.add_tokenizer_model("t5")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
+        self._xlmroberta_set_vocab()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
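
Note: with this change the converter picks its vocab path from the tokenizer type recorded in tokenizer.json: "Unigram" takes the new XLM-R/SentencePiece path (_xlmroberta_set_vocab, which also requires sentencepiece.bpe.model in the model directory), "WordPiece" falls back to the existing set_vocab path, and anything else raises. The sketch below is a hypothetical pre-flight check along the same lines, not code from the commit:

    # Sketch: report which vocab path convert_hf_to_gguf.py would take for a model dir.
    import json
    from pathlib import Path

    def vocab_path_for(dir_model: Path) -> str:
        with open(dir_model / "tokenizer.json") as f:
            toktyp = json.load(f)["model"]["type"]
        if toktyp == "Unigram":
            return "XLM-R / SentencePiece path (_xlmroberta_set_vocab)"
        if toktyp == "WordPiece":
            return "existing WordPiece path (super().set_vocab())"
        raise ValueError(f"unknown tokenizer: {toktyp}")

    print(vocab_path_for(Path("path/to/model")))  # hypothetical model directory
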
