
Commit 6e15f9b

Merge branch 'master' into merge-to-upstream-v2
2 parents 56b8229 + 22f281a commit 6e15f9b

25 files changed: +1221 -842 lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -860,7 +860,7 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))

       - name: Determine tag name
         id: tag

convert_hf_to_gguf.py

Lines changed: 32 additions & 26 deletions
@@ -48,7 +48,7 @@ class Model:

     dir_model: Path
     ftype: gguf.LlamaFileType
-    fname_out: Path | None
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
@@ -62,11 +62,12 @@ class Model:
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
+    dir_model_card: Path

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
@@ -90,6 +91,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path |
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -345,7 +347,7 @@ def prepare_metadata(self, vocab_only: bool):

         total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()

-        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

         # Fallback to model directory name if metadata name is still missing
         if self.metadata.name is None:
@@ -359,27 +361,22 @@ def prepare_metadata(self, vocab_only: bool):
         output_type: str = self.ftype.name.partition("_")[2]

         # Filename Output
-        # Note: `not is_dir()` is used because `.is_file()` will not detect
-        # file template strings as it doesn't actually exist as a file
-        if self.fname_out is not None and not self.fname_out.is_dir():
-            # Output path is a custom defined templated filename
-
-            # Process templated file name with the output ftype, useful with the "auto" ftype
-            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-        else:
+        if self.fname_out.is_dir():
             # Generate default filename based on model specification and available metadata
             if not vocab_only:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
             else:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")

-            # Check if preferred output directory path was provided
-            if self.fname_out is not None and self.fname_out.is_dir():
-                # output path is a directory
-                self.fname_out = self.fname_out / f"{fname_default}.gguf"
-            else:
-                # output in the same directory as the model by default
-                self.fname_out = self.dir_model / f"{fname_default}.gguf"
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            # file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)

         self.set_type()

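The rewrite above flips the branch order: a directory output path is handled first and anything else is treated as a templated filename, which is what makes the `fname_out is not None` checks unnecessary now that `fname_out` is always a `Path`. A minimal sketch of the resulting decision, with a simplified stand-in for `gguf.fill_templated_filename` and made-up example paths:

from pathlib import Path

def fill_templated_filename(name: str, output_type: str) -> str:
    # simplified stand-in for the gguf-py helper: fill "{ftype}"-style templates;
    # the real helper accepts additional template keys (assumption)
    return name.format(ftype=output_type.lower(), FTYPE=output_type.upper())

def resolve_output_path(fname_out: Path, fname_default: str, output_type: str) -> Path:
    if fname_out.is_dir():
        # a directory: drop the generated default name into it
        return fname_out / f"{fname_default}.gguf"
    # anything else is a templated file name such as "model-{ftype}.gguf";
    # is_file() would return False here because the templated file does not exist yet
    return fname_out.parent / fill_templated_filename(fname_out.name, output_type)

print(resolve_output_path(Path("."), "MyModel-7B-F16", "F16"))                # MyModel-7B-F16.gguf
print(resolve_output_path(Path("out/model-{ftype}.gguf"), "unused", "Q8_0"))  # out/model-q8_0.gguf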
@@ -596,6 +593,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
             # ref: https://huggingface.co/WisdomShell/CodeShell-7B
             res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"

         if res is None:
             logger.warning("\n")
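The `chkhsh` values compared here are checksums of pre-tokenizer behaviour, not of any file on disk; the block is regenerated by convert_hf_to_gguf_update.py from its repo list, which gains the matching "tekken" entry below. Roughly, and as a sketch rather than the update script's exact code, such a hash can be derived by encoding a fixed probe string and hashing the resulting token IDs:

from hashlib import sha256
from transformers import AutoTokenizer  # assumes the Hugging Face tokenizer files are available locally

def pre_tokenizer_hash(model_dir: str, probe_text: str) -> str:
    # two tokenizers with the same vocab but different pre-tokenization rules
    # produce different token IDs for the probe, hence different hashes
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    token_ids = tokenizer.encode(probe_text)
    return sha256(str(token_ids).encode()).hexdigest()

An unrecognized hash leaves `res` unset, which falls through to the warning path at the end of the hunk above.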
@@ -753,7 +753,8 @@ def _create_vocab_sentencepiece(self):
                 token_id = int(token_id)
                 token: str = token_data["content"]
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token.encode("utf-8")
+                    if tokens[token_id] != token.encode("utf-8"):
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                 if token_data.get("special") or self.does_token_look_special(token):
                     toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                 else:
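The four `set_vocab` hunks further down apply the same relaxation: rather than aborting the conversion when an added-tokens entry disagrees with the token already present, the converter logs what is being replaced and continues (in those hunks the added-tokens content then overwrites the entry). Distilled to a standalone sketch with illustrative names, not the converter's own:

import logging

logger = logging.getLogger("convert")

def override_token(tokens: list[bytes], token_id: int, new_content: str) -> None:
    # warn instead of asserting when the added-tokens definition disagrees,
    # then take the added-tokens definition as authoritative
    encoded = new_content.encode("utf-8")
    if tokens[token_id] != encoded:
        logger.warning("replacing token %d: %r -> %r",
                       token_id, tokens[token_id].decode("utf-8"), new_content)
    tokens[token_id] = encoded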
@@ -1312,6 +1313,7 @@ def set_vocab(self):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -2014,7 +2016,8 @@ def set_vocab(self):
                 token_id = int(token_id)
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2030,7 +2033,8 @@ def set_vocab(self):
                 token_id = int(foken_data["id"])
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2269,7 +2273,8 @@ def set_vocab(self):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2288,7 +2293,8 @@ def set_vocab(self):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2474,6 +2480,7 @@ def set_vocab(self):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

         self.gguf_writer.add_add_space_prefix(False)
@@ -3627,10 +3634,10 @@ def main() -> None:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)

-    fname_out = None
-
     if args.outfile is not None:
         fname_out = args.outfile
+    else:
+        fname_out = dir_model

     logger.info(f"Loading model: {dir_model.name}")

@@ -3661,7 +3668,6 @@ def main() -> None:
         else:
             logger.info("Exporting model...")
             model_instance.write()
-            assert model_instance.fname_out is not None
             out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
             logger.info(f"Model successfully exported to {out_path}")

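Taken together, the two main() hunks mean `fname_out` is always a real path: either the user-supplied --outfile value or, by default, the model directory itself, which prepare_metadata() later expands into `<dir>/<default-name>.gguf` as shown earlier; that is also why the `assert model_instance.fname_out is not None` after write() can be dropped. A compressed sketch of the new defaulting, assuming `args.outfile: Path | None` and `dir_model: Path`:

# the directory case is resolved into a concrete .gguf filename by prepare_metadata()
fname_out = args.outfile if args.outfile is not None else dir_model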

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
     {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
 ]

convert_lora_to_gguf.py

Lines changed: 18 additions & 8 deletions
@@ -290,7 +290,7 @@ def parse_args() -> argparse.Namespace:
         fname_out = args.outfile
     else:
         # output in the same directory as the model by default
-        fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
+        fname_out = dir_lora

     if os.path.exists(input_model):
         # lazy import load_file only if lora is in safetensors format.
@@ -304,12 +304,6 @@ def parse_args() -> argparse.Namespace:
     # load base model
     logger.info(f"Loading base model: {dir_base_model.name}")
     hparams = Model.load_hparams(dir_base_model)
-
-    with open(lora_config, "r") as f:
-        lparams: dict[str, Any] = json.load(f)
-
-    alpha: float = lparams["lora_alpha"]
-
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -320,12 +314,21 @@ def parse_args() -> argparse.Namespace:
         class LoraModel(model_class):
             model_arch = model_class.model_arch

+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

             def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                 super().set_gguf_parameters()

             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@@ -368,6 +371,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)

+        with open(lora_config, "r") as f:
+            lparams: dict[str, Any] = json.load(f)
+
+        alpha: float = lparams["lora_alpha"]
+
         model_instance = LoraModel(
             dir_base_model,
             ftype,
@@ -376,6 +384,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             use_temp_file=False,
             eager=args.no_lazy,
             dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
         )

         logger.info("Exporting model...")
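Two things change for the LoRA converter: `lora_alpha` is now read from the adapter config right before the model is constructed and injected through the constructor instead of being captured from an enclosing scope, and `dir_model_card` is pointed at the adapter directory so metadata (model card, name) come from the LoRA repo rather than the base model. A minimal sketch of the same injection pattern, independent of the converter classes and with hypothetical names:

from pathlib import Path

class BaseConverter:
    def __init__(self, dir_model: Path):
        self.dir_model = dir_model
        self.dir_model_card = dir_model  # default: metadata lives next to the weights

class LoraConverter(BaseConverter):
    def __init__(self, dir_base_model: Path, *, dir_lora_model: Path, lora_alpha: float):
        super().__init__(dir_base_model)
        self.dir_model_card = dir_lora_model  # metadata comes from the adapter directory
        self.lora_alpha = float(lora_alpha)   # later written with gguf.Keys.Adapter.LORA_ALPHA

adapter = LoraConverter(Path("base-model"), dir_lora_model=Path("my-lora"), lora_alpha=16.0)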

examples/gguf/gguf.cpp

Lines changed: 5 additions & 0 deletions
@@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {

     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

+    if (!ctx) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
     printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
     printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
     printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 3 additions & 1 deletion
@@ -26,11 +26,12 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false

     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]

-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0

     var n_decode: Int32 = 0
@@ -160,6 +161,7 @@ actor LlamaContext {

         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
             return new_token_str

examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
         messageLog += "\(text)"

         Task.detached {
-            while await llamaContext.n_cur < llamaContext.n_len {
+            while await !llamaContext.is_done {
                 let result = await llamaContext.completion_loop()
                 await MainActor.run {
                     self.messageLog += "\(result)"
