Skip to content

Commit 8aa5818

Browse files
committed
feat: Introduce VocabFactory for flexible vocabulary management in model conversion
- The VocabFactory class is added to facilitate modular vocabulary handling.
- The constructor initializes a directory path and detects vocabulary-related files.
- The _select_file method provides file paths based on vocabulary type (e.g., BPE, SentencePiece).
- _create_special_vocab generates special vocabularies, accommodating different types.
- The load_vocab method loads vocabularies, handling BPE, SentencePiece, and Hugging Face Fast Tokenizer.
- Error handling and logging enhance debugging and user feedback.
- The modular and flexible design simplifies vocabulary management and supports future extensions.

The VocabFactory class enhances code modularity and maintainability, allowing versatile vocabulary handling in the model conversion process.
1 parent 5fa1a08 commit 8aa5818

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

convert.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1355,6 +1355,83 @@ def load_some_model(path: Path) -> ModelPlus:
13551355
return model_plus
13561356

13571357

1358+
class VocabFactory:
    """Locate tokenizer files near a model path and build vocab objects from them.

    On construction the factory looks for ``tokenizer.model``, ``vocab.json``
    and ``tokenizer.json`` in ``path`` and, failing that, in ``path.parent``.
    ``load_vocab`` then picks the right file for the requested vocabulary type
    and returns the vocab together with its ``gguf.SpecialVocab``.
    """

    def __init__(self, path: Path):
        """Remember ``path`` and record which known tokenizer files exist."""
        self.path = path
        # Maps each known tokenizer file name to the location where it was
        # found, or None when it exists neither in `path` nor in its parent.
        self.files = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
        }
        self._detect_files()

    def _detect_files(self):
        """Fill ``self.files``, preferring ``self.path`` over its parent."""
        for file in self.files:
            for candidate in (self.path / file, self.path.parent / file):
                if candidate.exists():
                    self.files[file] = candidate
                    break

    def _select_file(self, vocabtype: Optional[str]) -> Path:
        """Return the path the vocab loader for ``vocabtype`` should be given.

        Raises:
            FileNotFoundError: the file required by "spm" or "bpe" was not
                detected.
            ValueError: ``vocabtype`` is not "spm", "bpe" or "hfft".
        """
        if vocabtype in ["spm", "bpe"]:
            # SentencePiece and BPE are each loaded from one specific file.
            file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
            found = self.files[file_key]
            if found is None:
                raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
            return found
        if vocabtype == "hfft":
            # The Hugging Face Fast Tokenizer is given the directory path
            # rather than a specific file.
            return self.path
        raise ValueError(f"Unsupported vocabulary type {vocabtype}")

    @staticmethod
    def _find_added_tokens(path: Path) -> Optional[Path]:
        """Return the ``added_tokens.json`` that belongs to ``path``, if any.

        ``path`` is a vocab file for "spm"/"bpe" but a directory for "hfft".
        Always using ``path.parent`` would, in the directory case, search one
        level too high and miss the file inside the model directory. The
        parent is still tried afterwards so previously working layouts keep
        resolving.
        """
        search_dirs = [path, path.parent] if path.is_dir() else [path.parent]
        for directory in search_dirs:
            candidate = directory / "added_tokens.json"
            if candidate.exists():
                return candidate
        return None

    def _create_special_vocab(
        self,
        vocab: Vocab,
        vocabtype: str,
        model_parent_path: Path,
    ) -> gguf.SpecialVocab:
        """Build the ``gguf.SpecialVocab`` for ``vocab``.

        Merges are requested only for "bpe"; ``n_vocab`` is taken from
        ``vocab.vocab_size`` when the vocab exposes it, otherwise ``None``.
        """
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=vocabtype == "bpe",
            special_token_types=None,  # Predetermined or passed as a parameter
            n_vocab=getattr(vocab, "vocab_size", None),
        )

    def load_vocab(
        self, vocabtype: str, model_parent_path: Path
    ) -> Tuple[Vocab, gguf.SpecialVocab]:
        """Load the vocabulary of type ``vocabtype`` and its special vocab.

        Args:
            vocabtype: "bpe", "spm" or "hfft".
            model_parent_path: passed through to ``gguf.SpecialVocab``.

        Returns:
            A ``(vocab, special_vocab)`` pair.

        Raises:
            FileNotFoundError: the file required by ``vocabtype`` is missing.
            ValueError: ``vocabtype`` is not supported.
        """
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")

        added_tokens_path = self._find_added_tokens(path)
        if vocabtype == "bpe":
            vocab = BpeVocab(path, added_tokens_path)
        elif vocabtype == "spm":
            vocab = SentencePieceVocab(path, added_tokens_path)
        elif vocabtype == "hfft":
            vocab = HfVocab(path, added_tokens_path)
        else:
            # _select_file already rejects unknown types; kept as a safeguard
            # in case the two type lists ever drift apart.
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
            model_parent_path,
        )
        return vocab, special_vocab
1433+
1434+
13581435
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
13591436
namestr = {
13601437
GGMLFileType.AllF32: "f32",

0 commit comments

Comments
 (0)