Commit db4b8ac

refactor: Standardize vocabulary handling with HfVocab
- Replaced VocabLoader with HfVocab, aligning vocabulary handling across classes.
- Updated initialization of HfVocab with local_files_only=True for AutoTokenizer.
- Introduced optional parameter fname_added_tokens for flexible added token management.
- Streamlined added token handling for clarity and conciseness.
- Maintained special tokens and IDs, enhancing token management.
- Simplified token processing methods for improved readability.
- Added a placeholder for score computation with a default value of -1000.0.
- Optimized newline token check for efficiency.
- Updated __repr__ function for clarity in representation.
- Adjusted type alias Vocab to include BpeVocab, SentencePieceVocab, and HfVocab.
- Removed redundant code related to special token handling, reverse vocabulary mapping, and vocabulary file detection.

This refactoring promotes a standardized and modular approach to vocabulary management, facilitating future integration with a VocabFactory and improving code maintainability and scalability.
1 parent 3ca2b10 commit db4b8ac
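
The closing note about a future VocabFactory pairs with the file-detection logic this commit removes from get_vocab_type (probing for tokenizer.model, vocab.json, or tokenizer.json). Below is a minimal, hypothetical sketch of what that selection step could look like if a factory took it over; the function name select_vocab_class and the mapping from files to the three vocab classes are illustrative assumptions, not part of this commit.

from pathlib import Path


def select_vocab_class(vocab_dir: Path) -> str:
    # Hypothetical sketch: reuse the probing order of the removed
    # get_vocab_type(), but return the name of the vocab class a
    # factory might construct instead of a "llama"/"gpt2" string.
    if (vocab_dir / "tokenizer.model").exists():
        return "SentencePieceVocab"  # SentencePiece model file
    if (vocab_dir / "vocab.json").exists():
        return "BpeVocab"            # GPT-2 style BPE vocabulary
    if (vocab_dir / "tokenizer.json").exists():
        return "HfVocab"             # Hugging Face tokenizer metadata
    raise FileNotFoundError(
        f"Could not find tokenizer.model, vocab.json or tokenizer.json in {vocab_dir}; "
        "if it's in another directory, pass the directory as --vocab-dir"
    )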

File tree

1 file changed: +66, -102 lines changed


convert.py

Lines changed: 66 additions & 102 deletions
@@ -508,92 +508,83 @@ def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-class VocabLoader:
-    def __init__(self, params: Params, fname_tokenizer: Path) -> None:
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use VocabLoader, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
-        except ValueError:
-            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
-
-        self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
-
-        for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
-            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
-                continue
-
-            self.added_tokens_dict[tok] = tokidx
+class HfVocab:
+    def __init__(
+        self,
+        fname_tokenizer: Path,
+        fname_added_tokens: Optional[Path] = None,
+    ) -> None:
+        print("fname_tokenizer:", fname_tokenizer)
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            fname_tokenizer,
+            cache_dir=fname_tokenizer,
+            local_files_only=True,
+        )
 
-        self.unk_token_id: int = self.tokenizer.unk_token_id
-        self.specials: dict[str, int] = {
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        self.specials = {
             tok: self.tokenizer.get_vocab()[tok]
             for tok in self.tokenizer.all_special_tokens
         }
-        self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
-        self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
-        self.vocab_size_base: int = self.tokenizer.vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
-        self.fname_tokenizer: Path = fname_tokenizer
-
-        vocab_file = "tokenizer.model"
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            self.spm = SentencePieceProcessor(str(path_candidate))
-            print(self.spm.vocab_size(), self.vocab_size_base)
-        else:
-            self.spm = None
+        self.special_ids = set(self.tokenizer.all_special_ids)
 
-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        added_tokens_ids = set(self.added_tokens_dict.values())
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
 
-        for i in range(self.vocab_size_base):
-            if i in added_tokens_ids:
-                continue
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
 
-            text = self.reverse_vocab[i].encode("utf-8")
-            yield text, self.get_token_score(i), self.get_token_type(i)
+    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
 
-    def get_token_type(self, token_id: int) -> gguf.TokenType:
-        toktype = gguf.TokenType.NORMAL
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
 
-        if self.spm is not None and token_id < self.spm.vocab_size():
-            if self.spm.is_unknown(token_id):
-                toktype = gguf.TokenType.UNKNOWN
-            if self.spm.is_control(token_id):
-                toktype = gguf.TokenType.CONTROL
-            if self.spm.is_unused(token_id):
-                toktype = gguf.TokenType.UNUSED
-            if self.spm.is_byte(token_id):
-                toktype = gguf.TokenType.BYTE
-        else:
-            token = self.reverse_vocab[token_id]
-            if token_id == self.unk_token_id:
-                toktype = gguf.TokenType.UNKNOWN
-            elif token_id in self.special_ids:
-                toktype = gguf.TokenType.CONTROL
-            elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
-                toktype = gguf.TokenType.BYTE
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, self.special_ids # Reuse already stored special IDs
+            )
 
-        return toktype
+    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
+        # Determine token type based on whether it's a special token
+        return (
+            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+        )
 
     def get_token_score(self, token_id: int) -> float:
-        if self.spm is not None and token_id < self.spm.vocab_size():
-            return cast(float, self.spm.get_score(token_id))
-        return 0.0
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0 # Default score
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-
-        for text in self.added_tokens_dict:
+        for text in self.added_tokens_list:
             if text in self.specials:
-
-                toktype = self.get_token_type(self.specials[text])
+                toktype = self.get_token_type(self.specials[text], self.special_ids)
                 score = self.get_token_score(self.specials[text])
 
             else:
@@ -602,45 +593,18 @@ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 
             yield text.encode("utf-8"), score, toktype
 
-    def has_newline_token(self) -> bool:
-        return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.hf_tokens()
         yield from self.added_tokens()
 
-    def get_vocab_type(self) -> str:
-        path_candidates = []
-        vocab_file = "tokenizer.model"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            return "llama"
-
-        vocab_file = "vocab.json"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            return "gpt2"
-
-        vocab_file = "tokenizer.json"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate:
-            if not self.has_newline_token():
-                return "gpt2"
-            return "llama"
-
-        raise FileNotFoundError(
-            f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
-            "if it's in another directory, pass the directory as --vocab-dir"
-        )
-
     def __repr__(self) -> str:
-        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
+        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab: TypeAlias = 'VocabLoader'
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
 
 
 #
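
For context, a minimal usage sketch of the new class. It assumes convert.py is importable from the repository root, the transformers package is installed, and ./my-model is an illustrative local directory that already contains the tokenizer files (local_files_only=True means nothing is downloaded); none of these names come from the commit itself.

from pathlib import Path

from convert import HfVocab  # the class introduced by this commit

# Illustrative path; replace with a real local model/tokenizer directory.
vocab = HfVocab(fname_tokenizer=Path("./my-model"))

print(vocab)                      # <HfVocab with N base tokens and M added tokens>
print(vocab.has_newline_token())  # True if "\n" or "<0x0A>" is in the vocabulary

# all_tokens() yields (token_bytes, score, gguf.TokenType): base tokens
# first via hf_tokens(), then the added tokens via added_tokens().
for token_bytes, score, toktype in vocab.all_tokens():
    pass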
