@@ -508,92 +508,83 @@ def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-class VocabLoader:
-    def __init__(self, params: Params, fname_tokenizer: Path) -> None:
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use VocabLoader, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
-        except ValueError:
-            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
-
-        self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
-
-        for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
-            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
-                continue
-
-            self.added_tokens_dict[tok] = tokidx
+class HfVocab:
+    def __init__(
+        self,
+        fname_tokenizer: Path,
+        fname_added_tokens: Optional[Path] = None,
+    ) -> None:
+        print("fname_tokenizer:", fname_tokenizer)
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            fname_tokenizer,
+            cache_dir=fname_tokenizer,
+            local_files_only=True,
+        )
 
-        self.unk_token_id: int = self.tokenizer.unk_token_id
-        self.specials: dict[str, int] = {
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        self.specials = {
             tok: self.tokenizer.get_vocab()[tok]
             for tok in self.tokenizer.all_special_tokens
         }
-        self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
-        self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
-        self.vocab_size_base: int = self.tokenizer.vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
-        self.fname_tokenizer: Path = fname_tokenizer
-
-        vocab_file = "tokenizer.model"
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            self.spm = SentencePieceProcessor(str(path_candidate))
-            print(self.spm.vocab_size(), self.vocab_size_base)
-        else:
-            self.spm = None
+        self.special_ids = set(self.tokenizer.all_special_ids)
 
-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        added_tokens_ids = set(self.added_tokens_dict.values())
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
 
-        for i in range(self.vocab_size_base):
-            if i in added_tokens_ids:
-                continue
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
 
-            text = self.reverse_vocab[i].encode("utf-8")
-            yield text, self.get_token_score(i), self.get_token_type(i)
+    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
 
-    def get_token_type(self, token_id: int) -> gguf.TokenType:
-        toktype = gguf.TokenType.NORMAL
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
 
-        if self.spm is not None and token_id < self.spm.vocab_size():
-            if self.spm.is_unknown(token_id):
-                toktype = gguf.TokenType.UNKNOWN
-            if self.spm.is_control(token_id):
-                toktype = gguf.TokenType.CONTROL
-            if self.spm.is_unused(token_id):
-                toktype = gguf.TokenType.UNUSED
-            if self.spm.is_byte(token_id):
-                toktype = gguf.TokenType.BYTE
-        else:
-            token = self.reverse_vocab[token_id]
-            if token_id == self.unk_token_id:
-                toktype = gguf.TokenType.UNKNOWN
-            elif token_id in self.special_ids:
-                toktype = gguf.TokenType.CONTROL
-            elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
-                toktype = gguf.TokenType.BYTE
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, self.special_ids  # Reuse already stored special IDs
+            )
 
-        return toktype
+    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
+        # Determine token type based on whether it's a special token
+        return (
+            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+        )
 
     def get_token_score(self, token_id: int) -> float:
-        if self.spm is not None and token_id < self.spm.vocab_size():
-            return cast(float, self.spm.get_score(token_id))
-        return 0.0
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-
-        for text in self.added_tokens_dict:
+        for text in self.added_tokens_list:
             if text in self.specials:
-
-                toktype = self.get_token_type(self.specials[text])
+                toktype = self.get_token_type(self.specials[text], self.special_ids)
                 score = self.get_token_score(self.specials[text])
 
             else:
@@ -602,45 +593,18 @@ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 
             yield text.encode("utf-8"), score, toktype
 
-    def has_newline_token(self) -> bool:
-        return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.hf_tokens()
         yield from self.added_tokens()
 
-    def get_vocab_type(self) -> str:
-        path_candidates = []
-        vocab_file = "tokenizer.model"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            return "llama"
-
-        vocab_file = "vocab.json"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            return "gpt2"
-
-        vocab_file = "tokenizer.json"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate:
-            if not self.has_newline_token():
-                return "gpt2"
-            return "llama"
-
-        raise FileNotFoundError(
-            f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
-            "if it's in another directory, pass the directory as --vocab-dir"
-        )
-
     def __repr__(self) -> str:
-        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
+        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab: TypeAlias = 'VocabLoader'
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
 
 
 #
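
As a point of reference (not part of the diff), here is a minimal sketch of how the new HfVocab class could be exercised on its own. It assumes the surrounding convert.py imports (Path, AutoTokenizer, gguf) are available, and the model directory below is hypothetical:

    from pathlib import Path

    # Hypothetical local directory containing tokenizer.json / tokenizer_config.json
    # previously saved with tokenizer.save_pretrained(...)
    vocab = HfVocab(Path("./models/example-model"))
    print(vocab)  # <HfVocab with N base tokens and M added tokens>

    # all_tokens() yields (text_bytes, score, gguf.TokenType) tuples,
    # base-vocabulary tokens first, then added tokens; in this revision the
    # base tokens all get the placeholder score of -1000.0 from get_token_score().
    for text, score, toktype in vocab.all_tokens():
        pass  # e.g. hand the tuples to the GGUF writer

Note that __init__ loads the tokenizer with local_files_only=True, so the directory must already contain the tokenizer files; nothing is downloaded.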