
Commit d2bb3ac

convert.py : remove GGML vocab + other obsolete stuff
1 parent 68f5348 commit d2bb3ac

File tree

1 file changed (+17 -29 lines)

convert-new.py

Lines changed: 17 additions & 29 deletions
@@ -278,19 +278,7 @@ def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-class GGMLVocab:
-    def __init__(self, tokens: List[Tuple[bytes, float]]):
-        self.tokens = tokens
-        self.vocab_size = len(tokens)
-
-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        return self.tokens
-
-    def __repr__(self) -> str:
-        return f"<GGMLVocab with {self.vocab_size} tokens>"
-
-
-Vocab = Union[SentencePieceVocab, GGMLVocab]
+Vocab = Union[SentencePieceVocab]
 
 
 def permute(weights: NDArray, n_head: int) -> NDArray:
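Note on the replacement alias: a typing.Union with a single member collapses to that member, so the new Vocab alias is simply another name for SentencePieceVocab. A quick illustration of that equivalence (not part of the commit; the stub class here stands in for the real one defined earlier in convert-new.py):

from typing import Union

class SentencePieceVocab:
    ...  # stand-in for the real class in convert-new.py

Vocab = Union[SentencePieceVocab]

# typing flattens a one-member Union to the member type itself
assert Vocab is SentencePieceVocab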
@@ -691,7 +679,6 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
 
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
-        # GGMLVocab comes from the same file as the model so shouldn't mismatch:
         assert isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
@@ -874,7 +861,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
     if path.is_dir():
         vocab_file = "tokenizer.model"
         if vocabtype == 'bpe':
-          vocab_file = "vocab.json"
+            vocab_file = "vocab.json"
         path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
         path3 = path.parent / vocab_file
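The touched function resolves which tokenizer file to read; the hunk itself only adjusts the indentation of the BPE branch. A rough sketch of the resolution order implied by the context lines; sketch_resolve_vocab_file is an illustrative name, and the real load_vocab has additional candidate paths and error handling not shown here:

from pathlib import Path
from typing import Optional

def sketch_resolve_vocab_file(path: Path, vocabtype: Optional[str]) -> Path:
    # SentencePiece vocabs ship as tokenizer.model; BPE vocabs as vocab.json.
    vocab_file = "vocab.json" if vocabtype == 'bpe' else "tokenizer.model"
    # Probe the given directory, then its parent (the `.parent` form handles
    # symlinked model directories better, per the original comment).
    for candidate in (path / vocab_file, path.parent / vocab_file):
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(f"could not find {vocab_file} near {path}")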
@@ -916,15 +903,14 @@ def do_dump_model(model_plus: ModelPlus) -> None:
 
 def main(args_in: Optional[List[str]] = None) -> None:
     parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
-    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
-    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
-    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
-    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model", type=Path,
-                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
-    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
+    parser.add_argument("--dump",        action="store_true", help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only",  action="store_true", help="extract only the vocab")
+    parser.add_argument("--outtype",     choices=["f32", "f16"], help="output format (default: based on input)")
+    parser.add_argument("--vocab-dir",   type=Path, help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--outfile",     type=Path, help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)")
     args = parser.parse_args(args_in)
 
     vocab: Vocab
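Two behavioural changes fall out of this hunk: q4_0 and q4_1 are dropped from --outtype, so the converter now writes f32 or f16 only, and --vocabtype loses its explicit default='spm' (argparse then passes None, which the load_vocab branch above treats the same as spm, since it only special-cases 'bpe'). Because main() accepts an argument list, the parser can also be exercised programmatically; an illustrative call with a made-up model path:

# "models/7B" is a placeholder path, not taken from the commit.
# Roughly equivalent to: python convert-new.py models/7B --outtype f16
main(["models/7B", "--outtype", "f16"])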
@@ -947,12 +933,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
-    params = Params.load(model_plus)
-    model = model_plus.model
-    model = do_necessary_conversions(model, params)
+
+    params      = Params.load(model_plus)
+    model       = model_plus.model
+    model       = do_necessary_conversions(model, params)
     output_type = pick_output_type(model, args.outtype)
-    model = convert_to_output_type(model, output_type)
-    outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+    model       = convert_to_output_type(model, output_type)
+    outfile     = args.outfile or default_outfile(model_plus.paths, output_type)
+
     OutputFile.write_all(outfile, params, output_type, model, vocab)
     print(f"Wrote {outfile}")
 
