@@ -3365,6 +3365,97 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return [(self.map_tensor_name(name), data_torch)]

+    def _xlmroberta_tokenizer_init(self) -> None:
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def _xlmroberta_set_vocab(self) -> None:
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
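+        # the HF XLM-RoBERTa tokenizer reserves ids 0-3 for <s>, <pad>, </s>, <unk>,
+        # while the raw SentencePiece model orders its specials <unk>, <s>, </s>;
+        # remap the leading entries so the GGUF ids match the HF ids
+        # (the slice keeps the overall list length unchanged)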
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
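+        # note: the GGUF tokenizer model "t5" corresponds to llama.cpp's Unigram
+        # SentencePiece implementation, which is what this vocab uses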
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+

@ModelBase.register("RobertaModel")
class RobertaModel(BertModel):
@@ -3423,6 +3514,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)

+        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()
+
        # the HF config claims n_ctx=8192, but it uses RoPE scaling
        self.hparams["n_ctx"] = 2048
@@ -3442,6 +3537,11 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
        assert self.hparams["rotary_emb_interleaved"] is False
        assert self.hparams["rotary_emb_scale_base"] is None

+    def set_vocab(self) -> None:
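+        # use the shared XLM-RoBERTa SentencePiece path when a Unigram tokenizer
+        # was detected in __init__; otherwise fall back to the base class vocab handling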
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()
+
    def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
        # If the tensor is an experts bias tensor, skip it by returning an empty list.
        if "mlp.experts.bias" in name:
@@ -3466,103 +3566,27 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
        self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])

+    def _is_tokenizer_xlmroberta(self) -> bool:
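+        # the model may ship either an XLM-RoBERTa-style Unigram SentencePiece tokenizer
+        # or a standard BERT WordPiece tokenizer; tokenizer.json's model type tells them apart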
+        with open(self.dir_model / "tokenizer.json") as f:
+            tokenizer_json = json.load(f)
+        toktyp = tokenizer_json["model"]["type"]
+        if toktyp == "Unigram":
+            return True
+        if toktyp == "WordPiece":
+            return False
+        raise ValueError(f"unknown tokenizer: {toktyp}")
+

@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
class XLMRobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
+        self._xlmroberta_tokenizer_init()

    def set_vocab(self):
-        # to avoid TypeError: Descriptors cannot be created directly
-        # exception when importing sentencepiece_model_pb2
-        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
-        from sentencepiece import SentencePieceProcessor
-        from sentencepiece import sentencepiece_model_pb2 as model
-
-        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
-        if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
-
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
-
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
-
-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
-
-        self.gguf_writer.add_tokenizer_model("t5")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
+        self._xlmroberta_set_vocab()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # if name starts with "roberta.", remove the prefix