
Commit 480440a

Merge branch 'master' into gg/ggml-common-decl

2 parents 5e5c5e7 + fb215c3

28 files changed, +1941 -444 lines

Makefile

Lines changed: 4 additions & 0 deletions
@@ -201,6 +201,10 @@ ifdef LLAMA_SERVER_VERBOSE
     MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif

+ifdef LLAMA_SERVER_SSL
+    MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+    MK_LDFLAGS += -lssl -lcrypto
+endif

 ifdef LLAMA_CODE_COVERAGE
     MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''

README.md

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

@@ -110,6 +111,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
+- [x] [Mamba](https://github.com/state-spaces/mamba)

 **Multimodal models:**

common/common.cpp

Lines changed: 16 additions & 0 deletions
@@ -1288,6 +1288,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param

     cparams.n_ctx = params.n_ctx;
     cparams.n_batch = params.n_batch;
+    cparams.n_parallel = params.n_parallel;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed = params.seed;
@@ -1851,3 +1852,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {

     printf("\n=== Done dumping\n");
 }
+
+void llama_embd_normalize(const float * inp, float * out, int n) {
+    double sum = 0.0;
+    for (int i = 0; i < n; i++) {
+        sum += inp[i] * inp[i];
+    }
+    sum = sqrt(sum);
+
+    const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+
+    for (int i = 0; i < n; i++) {
+        out[i] = inp[i] * norm;
+    }
+}
+

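Side note: the following is a minimal, self-contained sketch (not part of the commit; the helper name embd_normalize_sketch is made up here) of what the new llama_embd_normalize() helper computes. It scales an embedding by the inverse of its L2 norm; a common reason for doing this is that cosine similarity between two normalized embeddings then reduces to a plain dot product.

// Sketch only (not from the commit): mirrors the behavior of llama_embd_normalize()
// added in common/common.cpp, with a tiny driver to show the result.
#include <cmath>
#include <cstdio>

static void embd_normalize_sketch(const float * inp, float * out, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += inp[i] * inp[i];        // accumulate squared magnitudes
    }
    sum = std::sqrt(sum);

    const float norm = sum > 0.0 ? 1.0f / (float) sum : 0.0f;  // guard against all-zero input

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;        // scale to unit L2 norm
    }
}

int main() {
    const float embd[4] = { 3.0f, 0.0f, 4.0f, 0.0f };  // L2 norm = 5
    float out[4];

    embd_normalize_sketch(embd, out, 4);

    for (int i = 0; i < 4; i++) {
        printf("%.3f ", out[i]);       // prints: 0.600 0.000 0.800 0.000
    }
    printf("\n");
    return 0;
}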
common/common.h

Lines changed: 7 additions & 0 deletions
@@ -260,3 +260,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+//
+// Embedding utils
+//
+
+void llama_embd_normalize(const float * inp, float * out, int n);
+

convert-hf-to-gguf.py

Lines changed: 118 additions & 0 deletions
@@ -1847,6 +1847,124 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2


+@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+class MambaModel(Model):
+    model_arch = gguf.MODEL_ARCH.MAMBA
+
+    def set_vocab(self):
+        vocab_size = self.hparams["vocab_size"]
+        # Round vocab size to next multiple of 8
+        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
+        # pad using ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+        self.hparams["vocab_size"] = vocab_size
+
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # Use the GPT-NeoX tokenizer when no tokenizer files are present
+            tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
+            print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+            neox_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
+            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
+            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
+            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
+            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
+
+    def set_gguf_parameters(self):
+        d_model = self.find_hparam(["hidden_size", "d_model"])
+        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+        # ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+        dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+        # Fail early for models which don't have a block expansion factor of 2
+        assert d_inner == 2 * d_model
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams["n_layer"]
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        tok_embd = None
+        tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
+        output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
+
+        for name, data_torch in self.get_tensors():
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            if name.endswith(".A_log"):
+                print("A_log --> A ==> " + new_name)
+                data_torch = -torch.exp(data_torch)
+
+            # assuming token_embd.weight is seen before output.weight
+            if tok_embd is not None and new_name == output_name:
+                if torch.equal(tok_embd, data_torch):
+                    print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
+                    continue
+            if new_name == tok_embd_name:
+                tok_embd = data_torch
+
+            data = data_torch.squeeze().numpy()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert big float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######


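One detail worth spelling out from the converter above: -(vocab_size // -pad_vocab) * pad_vocab is Python's ceiling-division idiom, used to round the vocab size up to the next multiple of pad_vocab_size_multiple. A rough C++ equivalent (illustrative only, not from the commit):

#include <cstdio>

// Round `value` up to the next multiple of `pad`; matches the Python idiom
// -(value // -pad) * pad used in MambaModel.set_vocab() above.
static int round_up_to_multiple(int value, int pad) {
    return ((value + pad - 1) / pad) * pad;   // ceiling division, then scale back up
}

int main() {
    // e.g. a 50277-entry vocab padded to a multiple of 8 becomes 50280
    printf("%d\n", round_up_to_multiple(50277, 8));
    return 0;
}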
examples/batched-bench/batched-bench.cpp

Lines changed: 8 additions & 5 deletions
@@ -105,6 +105,9 @@ int main(int argc, char ** argv) {
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

+    // ensure enough sequences are available
+    ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end());
+
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

     if (ctx == NULL) {
@@ -174,10 +177,10 @@ int main(int argc, char ** argv) {

         llama_batch_clear(batch);

-        const int n_tokens = is_pp_shared ? pp : pl*pp;
-
-        for (int i = 0; i < n_tokens; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
+        for (int i = 0; i < pp; ++i) {
+            for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
+                llama_batch_add(batch, 0, i, { j }, false);
+            }
         }
         batch.logits[batch.n_tokens - 1] = true;

@@ -192,7 +195,7 @@ int main(int argc, char ** argv) {

         if (is_pp_shared) {
             for (int32_t i = 1; i < pl; ++i) {
-                llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
+                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
             }
         }

examples/batched/batched.cpp

Lines changed: 2 additions & 1 deletion
@@ -80,6 +80,7 @@ int main(int argc, char ** argv) {
     ctx_params.seed = 1234;
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
+    ctx_params.n_parallel = n_parallel;
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

@@ -132,7 +133,7 @@ int main(int argc, char ** argv) {
     // assign the system KV cache to all parallel sequences
     // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
     for (int32_t i = 1; i < n_parallel; ++i) {
-        llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
+        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
     }

     if (n_parallel > 1) {

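A note on the recurring llama_kv_cache_seq_cp(ctx, 0, i, -1, -1) pattern in this commit: as far as I can tell from llama.h of this period, negative p0/p1 act as open bounds, so (-1, -1) copies the whole source sequence rather than an explicit [0, n_tokens) range. A fragment-style sketch under that assumption (assumes an already-initialized llama_context * ctx; the helper name is made up):

#include "llama.h"

// Copy the shared prompt held in sequence 0 into sequences 1..n_parallel-1,
// independent of how many tokens the prompt actually occupies.
static void share_prompt_across_sequences(llama_context * ctx, int32_t n_parallel) {
    for (int32_t seq = 1; seq < n_parallel; ++seq) {
        llama_kv_cache_seq_cp(ctx, 0, seq, -1, -1);  // negative bounds = entire sequence
    }
}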
examples/embedding/embedding.cpp

Lines changed: 1 addition & 13 deletions
@@ -23,17 +23,6 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }

-static void normalize(const float * vec, float * out, int n) {
-    float norm = 0;
-    for (int i = 0; i < n; i++) {
-        norm += vec[i] * vec[i];
-    }
-    norm = sqrt(norm);
-    for (int i = 0; i < n; i++) {
-        out[i] = vec[i] / norm;
-    }
-}
-
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
@@ -44,7 +33,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         fprintf(stderr, "%s : failed to decode\n", __func__);
     }

-    // normalize on copy
     for (int i = 0; i < batch.n_tokens; i++) {
         if (!batch.logits[i]) {
             continue;
@@ -61,7 +49,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }

         float * out = output + batch.seq_id[i][0] * n_embd;
-        normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd);
     }
 }

examples/parallel/parallel.cpp

Lines changed: 13 additions & 7 deletions
@@ -107,6 +107,9 @@ int main(int argc, char ** argv) {
     // number of simultaneous "clients" to simulate
     const int32_t n_clients = params.n_parallel;

+    // dedicate one sequence to the system prompt
+    params.n_parallel += 1;
+
     // requests to simulate
     const int32_t n_seq = params.n_sequences;

@@ -196,8 +199,8 @@ int main(int argc, char ** argv) {
     }

     // assign the system KV cache to all parallel sequences
-    for (int32_t i = 1; i < n_clients; ++i) {
-        llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
+    for (int32_t i = 1; i <= n_clients; ++i) {
+        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
     }

     LOG_TEE("\n");
@@ -221,15 +224,17 @@ int main(int argc, char ** argv) {

             client.i_batch = batch.n_tokens;

-            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
+            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);

             client.n_decoded += 1;
         }

         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
-            for (int i = 0; i < n_clients; ++i) {
-                llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
+            for (int i = 1; i <= n_clients; ++i) {
+                llama_kv_cache_seq_rm(ctx, i, -1, -1);
+                // but keep the system prompt
+                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
             }

             LOG_TEE("%s: clearing the KV cache\n", __func__);
@@ -255,7 +260,7 @@ int main(int argc, char ** argv) {
                 tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);

                 for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                    llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
+                    llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
                 }

                 // extract the logits only for the last token
@@ -366,7 +371,8 @@ int main(int argc, char ** argv) {
                 }

                 // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
+                llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
+                llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);

                 const auto t_main_end = ggml_time_us();

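The client.id + 1 changes above come from reserving sequence 0 for the system prompt (hence params.n_parallel += 1), so client i now decodes into sequence i + 1. A hypothetical helper mirroring the new reset path, under the same assumptions as the sketch above:

#include "llama.h"

// Wipe everything a finished client generated, then re-seed its sequence
// from the system prompt that sequence 0 keeps permanently.
static void reset_client_sequence(llama_context * ctx, int32_t client_id) {
    const int32_t seq = client_id + 1;           // seq 0 is reserved for the system prompt
    llama_kv_cache_seq_rm(ctx, seq, -1, -1);     // drop the client's entire sequence
    llama_kv_cache_seq_cp(ctx, 0, seq, -1, -1);  // copy the system prompt back in
}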
examples/perplexity/perplexity.cpp

Lines changed: 6 additions & 3 deletions
@@ -809,7 +809,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     const int n_batch = params.n_batch;

     const int max_tasks_per_batch = 32;
-    const int max_seq = 4*max_tasks_per_batch;
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));

     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

@@ -1086,7 +1086,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     const int n_batch = params.n_batch;

     const int max_tasks_per_batch = 128;
-    const int max_seq = 2*max_tasks_per_batch;
+    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_max_seq(ctx));

     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

@@ -1438,7 +1438,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     const int n_batch = params.n_batch;

     const int max_tasks_per_batch = 32;
-    const int max_seq = 4*max_tasks_per_batch;
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));

     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

@@ -1815,6 +1815,9 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;

+    // ensure there's at least enough seq_ids for HellaSwag
+    params.n_parallel = std::max(4, params.n_parallel);
+
     // load the model and apply lora adapter, if any
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == NULL) {

examples/server/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -1,12 +1,18 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+if (LLAMA_SERVER_SSL)
+    find_package(OpenSSL REQUIRED)
+    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+    target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
+endif()
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
