
Commit 7a37ab8

Merge branch 'fm5319' into fallback-memtype-5319

2 parents: aa7d31e + 9615d2f

24 files changed: +2837 −2088 lines

CMakeLists.txt

Lines changed: 8 additions & 4 deletions
@@ -809,9 +809,9 @@ if (LLAMA_CCACHE)
     if (LLAMA_CCACHE_FOUND)
         set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
         set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "Using ccache")
+        message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.")
     else()
-        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
+        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF")
     endif ()
 endif()

@@ -850,7 +850,9 @@ endif()

 set(ARCH_FLAGS "")

-if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
     message(STATUS "ARM detected")
     if (MSVC)
         add_compile_definitions(__ARM_NEON)

@@ -876,7 +878,9 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
     message(STATUS "x86 detected")
     if (MSVC)
         # instruction set detection for MSVC only
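
The rewritten conditions prefer an explicitly requested target (CMAKE_OSX_ARCHITECTURES on macOS, the lower-cased generator platform on MSVC) and only fall back to CMAKE_SYSTEM_PROCESSOR when neither is set. A minimal Python sketch of the new ARM predicate, with hypothetical input values rather than real CMake state:

import re

def is_arm_target(osx_archs, generator_platform, system_processor):
    # Mirrors the new CMake check: explicit target settings win,
    # the host processor string is only consulted as a fallback.
    if osx_archs == "arm64" or generator_platform == "arm64":
        return True
    if not osx_archs and not generator_platform:
        return re.fullmatch(r"aarch64|arm.*|ARM64", system_processor) is not None
    return False

print(is_arm_target("", "", "aarch64"))      # True: native aarch64 build
print(is_arm_target("arm64", "", "x86_64"))  # True: arm64 cross-build on an x86_64 macOS host
print(is_arm_target("", "", "x86_64"))       # False: plain x86_64 build

The practical effect is that cross-compiling (for example, targeting arm64 from an x86_64 host) no longer lands in the wrong architecture branch just because the host processor string does not match.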

Makefile

Lines changed: 119 additions & 50 deletions
Large diffs are not rendered by default.

README-sycl.md

Lines changed: 4 additions & 6 deletions
@@ -311,15 +311,13 @@ Output (example):

 a. Download & install cmake for Windows: https://cmake.org/download/

-b. Download & install make for Windows provided by mingw-w64
+b. Download & install mingw-w64 make for Windows provided by w64devkit

-  - Download binary package for Windows in https://github.com/niXman/mingw-builds-binaries/releases.
+  - Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).

-    Like [x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z](https://github.com/niXman/mingw-builds-binaries/releases/download/13.2.0-rt_v11-rev1/x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z).
+  - Extract `w64devkit` on your pc.

-  - Unzip the binary package. In the **bin** sub-folder and rename **xxx-make.exe** to **make.exe**.
-
-  - Add the **bin** folder path in the Windows system PATH environment.
+  - Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.

 ### Build locally:

README.md

Lines changed: 86 additions & 111 deletions
Large diffs are not rendered by default.

SHA256SUMS

Lines changed: 0 additions & 40 deletions
This file was deleted.

common/common.cpp

Lines changed: 20 additions & 2 deletions
@@ -46,6 +46,10 @@
 #define GGML_USE_CUBLAS_SYCL
 #endif

+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUBLAS_SYCL_VULKAN
+#endif
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores

@@ -399,6 +403,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
            sparams.penalty_present = std::stof(argv[i]);
+        } else if (arg == "--dynatemp-range") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.dynatemp_range = std::stof(argv[i]);
+        } else if (arg == "--dynatemp-exp") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.dynatemp_exponent = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -648,8 +664,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 params.tensor_split[i] = 0.0f;
             }
         }
-#ifndef GGML_USE_CUBLAS_SYCL
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
+#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
 #endif // GGML_USE_CUBLAS_SYCL
     } else if (arg == "--no-mmap") {
         params.use_mmap = false;

@@ -942,6 +958,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
     printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
     printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
+    printf("  --dynatemp-range N    dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
+    printf("  --dynatemp-exp N      dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
     printf("  --mirostat N          use Mirostat sampling.\n");
     printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
     printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);

convert-hf-to-gguf.py

Lines changed: 76 additions & 2 deletions
@@ -22,6 +22,8 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

+from convert import HfVocab
+

 # check for any of the given keys in the dictionary and return the value of the first key found
 def get_key_opts(d, keys):

@@ -205,6 +207,8 @@ def from_model_architecture(model_architecture):
             return OrionModel
         if model_architecture == "InternLM2ForCausalLM":
             return InternLM2Model
+        if model_architecture == "MiniCPMForCausalLM":
+            return MiniCPMModel
         return Model

     def _is_model_safetensors(self) -> bool:

@@ -258,6 +262,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
             return gguf.MODEL_ARCH.ORION
         if arch == "InternLM2ForCausalLM":
             return gguf.MODEL_ARCH.INTERNLM2
+        if arch == "MiniCPMForCausalLM":
+            return gguf.MODEL_ARCH.MINICPM

         raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -402,6 +408,31 @@ def _set_vocab_sentencepiece(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_hf(self):
+        path = self.dir_model
+        added_tokens_path = self.dir_model
+        vocab = HfVocab(
+            path, added_tokens_path if added_tokens_path.exists() else None
+        )
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+

 class GPTNeoXModel(Model):
     def set_gguf_parameters(self):

@@ -1041,6 +1072,24 @@ def set_vocab(self):
         self._set_vocab_sentencepiece()


+class MiniCPMModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        self.gguf_writer.add_name("MiniCPM")
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+
+    def set_vocab(self):
+        self._set_vocab_hf()
+

 class QwenModel(Model):
     @staticmethod
     def token_bytes_to_string(b):

@@ -1416,8 +1465,32 @@ def set_vocab(self):
         self.gguf_writer.add_add_space_prefix(add_prefix)

         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        old_eos = special_vocab.special_token_ids["eos"]
+        if "chat" in os.path.basename(self.dir_model.absolute()):
+            # For the chat model, we replace the eos with '<|im_end|>'.
+            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
+            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
+in chat mode so that the conversation can end normally.")
+
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _try_get_sft_eos(self, tokenizer):
+        unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
+        im_end_list = tokenizer.encode('<|im_end|>')
+        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
+        if len(unused_145_list) == 1:
+            eos_token = unused_145_list[0]
+        if len(im_end_list) == 1:
+            eos_token = im_end_list[0]
+        return eos_token
+
+    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
     def set_gguf_parameters(self):
         self.gguf_writer.add_name("InternLM2")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
@@ -1486,8 +1559,9 @@ def write_tensors(self):
             qkv = data_torch
             qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
             q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
-            q = rearrange(q, " o g n i -> o (g n i)").T
-            k = rearrange(k, " o g n i -> o (g n i)").T
+            # The model weights of q and k require an additional reshape.
+            q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
+            k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
             v = rearrange(v, " o g n i -> o (g n i)").T
             self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
             self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
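
As a worked illustration of the `_hf_permute_qk` helper added above, here is the same reshape applied to a NumPy array with hypothetical sizes (a sketch, not part of the commit): it regroups the two rotary halves of each attention head from the Hugging Face ordering into the ordering the GGUF converter expects, without changing the tensor shape or its values.

import numpy as np

def hf_permute_qk(weights, n_head, n_head_kv):
    # Same reshape as the converter: split each head into its two rotary
    # halves, swap the grouping, then flatten back to the original shape.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

# Hypothetical sizes: 4 heads of dimension 8 over a 16-wide input (32 x 16 weight).
w = np.arange(32 * 16, dtype=np.float32).reshape(32, 16)
p = hf_permute_qk(w, n_head=4, n_head_kv=4)
print(w.shape == p.shape)                                            # True: shape is preserved
print(np.array_equal(np.sort(w, axis=None), np.sort(p, axis=None)))  # True: only the row order changes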

convert.py

Lines changed: 9 additions & 5 deletions
@@ -334,9 +334,9 @@ def load(model_plus: ModelPlus) -> Params:
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        try:
+        if isinstance(self.bpe_tokenizer.get('model'), dict):
             self.vocab = self.bpe_tokenizer["model"]["vocab"]
-        except KeyError:
+        else:
             self.vocab = self.bpe_tokenizer
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:

@@ -515,10 +515,14 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:

         # Yield token text, score, and type
         yield token_text, self.get_token_score(token_id), self.get_token_type(
-            token_id, self.special_ids  # Reuse already stored special IDs
+            token_id, token_text, self.special_ids  # Reuse already stored special IDs
         )

-    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
         # Determine token type based on whether it's a special token
         return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

@@ -530,7 +534,7 @@ def get_token_score(self, token_id: int) -> float:
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], self.special_ids)
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                 score = self.get_token_score(self.specials[text])
             else:
                 toktype = gguf.TokenType.USER_DEFINED
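
The new `token_text` parameter exists so that SentencePiece byte-fallback tokens can be tagged as BYTE before the usual CONTROL/NORMAL decision. A small stand-alone illustration of the same regex, with hypothetical token strings and IDs rather than a real vocabulary:

import re

BYTE_TOKEN = re.compile(br"<0x[0-9A-Fa-f]{2}>")

def classify(token_text: bytes, token_id: int, special_ids: set) -> str:
    # Byte-fallback tokens such as b"<0x0A>" are tagged BYTE first,
    # everything else falls through to the special-ID check.
    if BYTE_TOKEN.fullmatch(token_text):
        return "BYTE"
    return "CONTROL" if token_id in special_ids else "NORMAL"

print(classify(b"<0x0A>", 13, {1, 2}))  # BYTE (the newline byte token)
print(classify(b"hello", 42, {1, 2}))   # NORMAL
print(classify(b"<s>", 1, {1, 2}))      # CONTROL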

examples/llava/llava-cli.cpp

Lines changed: 1 addition & 13 deletions
@@ -34,7 +34,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {

 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }

@@ -152,20 +152,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     size_t image_pos = prompt.find("<image>");
     if (image_pos != std::string::npos) {
         // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-
         system_prompt = prompt.substr(0, image_pos);
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
-        size_t pos = 0;
-        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
-            user_prompt.replace(pos, 2, "\n");
-            pos += 1; // Advance past the replaced newline
-        }
-        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
-            system_prompt.replace(pos, 2, "\n");
-            pos += 1; // Advance past the replaced newline
-        }
-
         printf("system_prompt: %s\n", system_prompt.c_str());
         printf("user_prompt: %s\n", user_prompt.c_str());
     } else {
