Commit f6d5fe3

Use some tricks to eliminate the necessity for a new format
1 parent 41a2ed0 commit f6d5fe3

3 files changed: 43 additions, 46 deletions


convert.py

Lines changed: 16 additions & 16 deletions
@@ -142,7 +142,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
 @dataclass
 class Params:
     n_vocab: int
-    n_vocab_sp:int
+    n_vocab_base: int
     n_embd: int
     n_mult: int
     n_head: int
@@ -170,7 +170,7 @@ def guessed(model: 'LazyModel') -> 'Params':
 
         return Params(
             n_vocab = n_vocab,
-            n_vocab_sp= n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = 256,
             n_head = n_head,
@@ -193,7 +193,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
 
         return Params(
             n_vocab = n_vocab,
-            n_vocab_sp= n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = n_mult,
             n_head = n_head,
@@ -218,7 +218,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
 
         return Params(
             n_vocab = n_vocab,
-            n_vocab_sp= n_vocab
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = n_mult,
             n_head = n_head,
@@ -283,7 +283,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         else:
             tokenizer_config = {}
         for key, value in tokenizer_config.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1:
@@ -296,15 +296,13 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         else:
             special_tokens = {}
         for key, value in special_tokens.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1 or token_id in self.special_tokens_map:
                 continue
             self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
 
-        self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)
-
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         if self.vocabtype == "bpe":
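The `or` to `and` change in the two hunks above fixes an inverted filter: no value is both a dict and a str, so with `or` at least one of the negated checks was always true and every tokenizer_config / special-tokens entry got skipped. A minimal standalone sketch of the corrected check, with hypothetical sample data:

# Hypothetical tokenizer_config entries: dict form, string form, and a non-token field.
tokenizer_config = {
    "bos_token": {"content": "<s>"},
    "eos_token": "</s>",
    "model_max_length": 2048,
}

for key, value in tokenizer_config.items():
    # With `or`, this condition held for every value (nothing is both a dict
    # and a str), so the old code always hit `continue` and collected nothing.
    if not isinstance(value, dict) and not isinstance(value, str):
        continue  # only drops values that are neither dict nor str, e.g. the int above
    content = value["content"] if isinstance(value, dict) else value
    print(key, "->", content)
# bos_token -> <s>
# eos_token -> </s>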
@@ -361,7 +359,7 @@ def __init__(self, tokens: List[Tuple[bytes, float]]):
         self.tokens = tokens
         self.special_tokens = []
         self.vocab_size = len(tokens)
-        self.vocab_special_size = 0
+        self.vocab_size_base = 0
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens
@@ -1120,17 +1118,21 @@ def __init__(self, fname_out: Path) -> None:
     def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1]) # magic
         values = [
-            4, # file version
+            1, # file version
             params.n_vocab,
-            params.n_vocab_sp,
             params.n_embd,
             params.n_mult,
             params.n_head,
             params.n_layer,
+<<<<<<< HEAD
             params.n_embd // params.n_head, # rot (obsolete)
             file_type.value,
+=======
+            params.n_vocab_base | 0xF0000000, # reuse obsolete rot value to store vocab_base
+            params.file_type.value,
+>>>>>>> bfccc62 (Use some tricks to eliminate the necessity for a new format)
         ]
-        self.fout.write(struct.pack("i" * len(values), *values))
+        self.fout.write(struct.pack("I" * len(values), *values))
 
     def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
         sname = name.encode('utf-8')
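This hunk contains the central trick: rather than appending a new n_vocab_sp field (which is what had forced the format version bump to 4), the writer keeps the old ggjt header layout and stores n_vocab_base in the slot that previously carried the obsolete rot value, tagged with the 0xF0000000 marker bits. Because a tagged value no longer fits in a signed 32-bit integer, struct.pack switches from "i" to "I". A minimal sketch of the resulting header packing, with hypothetical example values rather than the full OutputFile writer:

import struct

# Hypothetical values: a LLaMA-7B-style model with 32 added tokens.
n_vocab, n_vocab_base = 32032, 32000
n_embd, n_mult, n_head, n_layer = 4096, 256, 32, 32
file_type_value = 0  # e.g. GGMLFileType.AllF32

values = [
    1,                           # file version, as written in the diff above
    n_vocab,
    n_embd,
    n_mult,
    n_head,
    n_layer,
    n_vocab_base | 0xF0000000,   # reuse the obsolete rot slot to carry n_vocab_base
    file_type_value,
]

# Tagged values exceed the signed 32-bit range, hence "I" (unsigned) instead of "i".
header = struct.pack("I" * len(values), *values)
print(len(header))  # 32 bytes, following the 4-byte b"tjgg" magic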
@@ -1144,13 +1146,11 @@ def write_vocab(self, vocab: Vocab) -> None:
             self.fout.write(struct.pack("i", len(text)))
             self.fout.write(text)
             self.fout.write(struct.pack("f", score))
-        for token_id in vocab.all_special_tokens():
-            self.fout.write(struct.pack("i", token_id))
 
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_vocab_sp=vocab.vocab_special_size, n_embd=0, n_mult=0,
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_base=vocab.vocab_size_base, n_embd=0, n_mult=0,
                         n_head=1, n_layer=0)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
@@ -1373,7 +1373,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
         params = Params.load(model_plus)
-        params.n_vocab_sp = vocab.vocab_special_size
+        params.n_vocab_base = vocab.vocab_size_base
         model = model_plus.model
         model = do_necessary_conversions(model, params)
         output_type = pick_output_type(model, args.outtype)

llama.cpp

Lines changed: 26 additions & 29 deletions
@@ -181,14 +181,13 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
-    uint32_t n_vocab_sp = 0;
+    uint32_t n_vocab_base = 32000;
     uint32_t n_ctx = 512; // this is provided as user input?
     uint32_t n_embd = 4096;
     uint32_t n_mult = 256;
     uint32_t n_head = 32;
     uint32_t n_head_kv = 32;
     uint32_t n_layer = 32;
-    uint32_t n_rot = 64;
 
     // LLaMAv2
     // TODO: load from model data hparams
@@ -499,7 +498,6 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
     LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-    LLAMA_FILE_VERSION_GGJT_V4, // improved support for added/special tokens
 };
 
 struct llama_file_loader {
@@ -515,6 +513,7 @@ struct llama_file_loader {
         read_hparams();
         read_vocab();
         read_tensor_metadata(tensors_map);
+        set_vocab_sp();
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -537,7 +536,6 @@ struct llama_file_loader {
             case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
             case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
             case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
-            case 4: file_version = LLAMA_FILE_VERSION_GGJT_V4; return;
         }
     }
 
@@ -546,18 +544,18 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_vocab_sp = file_version >= LLAMA_FILE_VERSION_GGJT_V4 ? file.read_u32() : 0;
         hparams.n_embd = file.read_u32();
         hparams.n_mult = file.read_u32();
         hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot = file.read_u32();
+        hparams.n_vocab_base = file.read_u32();
+        hparams.n_vocab_base = (hparams.n_vocab_base & 0xF0000000) == 0 ? hparams.n_vocab : (hparams.n_vocab_base & ~0xF0000000); // this bitwise operation is necessary for compatibility with older models
         hparams.ftype = (enum llama_ftype) file.read_u32();
 
         // LLaMAv2
         // TODO: read from header
         hparams.n_head_kv = hparams.n_head;
-    }
+=======
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
 
@@ -574,20 +572,6 @@ struct llama_file_loader {
             tok_score.tok = std::move(word);
             tok_score.score = score;
         }
-
-        vocab.special_token_to_id.reserve(hparams.n_vocab_sp);
-
-        for (uint32_t i = 0; i < hparams.n_vocab_sp; i++) {
-            llama_vocab::id token_id = file.read_u32();
-            const auto & word = vocab.id_to_token[token_id].tok;
-
-            vocab.special_token_trie.add(word);
-            vocab.special_token_to_id[word] = token_id;
-
-            if (vocab.max_special_token_length < word.size()) {
-                vocab.max_special_token_length = word.size();
-            }
-        }
     }
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
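Two hunks above, read_hparams now recovers n_vocab_base from the reused slot, and the hunk directly above drops the old special-token id list from read_vocab. Backward compatibility rests on the marker bits: in older files the slot holds a plain rot value (128 for the 7B defaults shown earlier), whose top nibble is clear, so the loader falls back to treating the whole vocabulary as base vocab; in new files the top nibble is set and masking with ~0xF0000000 recovers the stored base size. A small Python sketch of that decode, with a hypothetical helper name and inputs:

# Python sketch of the read_hparams decode (hypothetical helper, not the C++ API).
def decode_vocab_base(raw_slot: int, n_vocab: int) -> int:
    if (raw_slot & 0xF0000000) == 0:
        # Older file: the slot held the obsolete rot value, so there is no
        # base-vocab info and the whole vocabulary counts as base vocab.
        return n_vocab
    # Newer file: strip the marker bits to recover the stored n_vocab_base.
    return raw_slot & ~0xF0000000

print(decode_vocab_base(128, 32000))                 # old-style slot -> 32000
print(decode_vocab_base(32000 | 0xF0000000, 32032))  # tagged slot    -> 32000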
@@ -634,6 +618,24 @@ struct llama_file_loader {
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
+    void set_vocab_sp() {
+        uint32_t vocab_sp = 3 + hparams.n_vocab - hparams.n_vocab_base;
+        vocab.special_token_to_id.reserve(vocab_sp);
+        for (uint32_t i = 0; i < vocab_sp; i++) {
+            llama_vocab::id token_id = i > 2 ? hparams.n_vocab_base + i : i;
+            const auto & word = vocab.id_to_token[token_id].tok;
+            if (word.empty()) {
+                continue;
+            }
+
+            vocab.special_token_trie.add(word);
+            vocab.special_token_to_id[word] = token_id;
+
+            if (vocab.max_special_token_length < word.size()) {
+                vocab.max_special_token_length = word.size();
+            }
+        }
+    }
 };
 
 struct llama_file_saver {
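With nothing extra stored in the file, set_vocab_sp (added above) rebuilds the special-token map from the two vocabulary sizes alone: ids 0 through 2 (typically the sentencepiece <unk>/<s>/</s> entries) plus the added-token ids above n_vocab_base are taken as candidates, empty entries are skipped, and the surviving words are registered in the trie and id map. A Python sketch of that selection rule on a hypothetical toy vocabulary (an illustration of the idea, not a line-by-line port of the loop above):

# Illustration of the set_vocab_sp selection rule on hypothetical toy data.
n_vocab_base = 6
id_to_token = ["<unk>", "<s>", "</s>", "the", "cat", "sat",  # base vocabulary
               "<|user|>", "<|assistant|>"]                  # added tokens
n_vocab = len(id_to_token)

# Candidates: the three leading control tokens plus everything past the base vocab.
candidate_ids = list(range(3)) + list(range(n_vocab_base, n_vocab))

special_token_to_id = {}
max_special_token_length = 0
for token_id in candidate_ids:
    word = id_to_token[token_id]
    if not word:  # skip empty entries, as the loader does
        continue
    special_token_to_id[word] = token_id
    max_special_token_length = max(max_special_token_length, len(word))

print(special_token_to_id)
# {'<unk>': 0, '<s>': 1, '</s>': 2, '<|user|>': 6, '<|assistant|>': 7}
print(max_special_token_length)  # 13, the length of '<|assistant|>'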
@@ -653,12 +655,11 @@ struct llama_file_saver {
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
-        file.write_u32(hparams.n_vocab_sp);
         file.write_u32(hparams.n_embd);
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
-        file.write_u32(hparams.n_rot);
+        file.write_u32(hparams.n_vocab_base | 0xF0000000); // this bitwise operation is necessary for compatibility with older models
         file.write_u32(new_ftype);
     }
     void write_vocab() {
@@ -672,9 +673,6 @@
             file.write_raw(token_score.tok.data(), token_score.tok.size());
             file.write_raw(&token_score.score, sizeof(token_score.score));
         }
-        for (const auto & pair : any_file_loader->vocab.special_token_to_id) {
-            file.write_u32(pair.second);
-        }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
         switch (new_type) {
@@ -1001,8 +999,7 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
-        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (pre #1931)";
-        case LLAMA_FILE_VERSION_GGJT_V4: return "ggjt v4 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";
@@ -1127,7 +1124,7 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
     fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
     fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
-    fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_embd/hparams.n_head); // a.k.a. n_embd_head, n_head_dim
     fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
     fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
     fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);

llama.h

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define LLAMA_FILE_VERSION 4
+#define LLAMA_FILE_VERSION 3
 #define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
 #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
