
Commit ba48e37

cont
ggml-ci
1 parent 7035c79 commit ba48e37

File tree

7 files changed: +332 -339 lines changed

src/llama-batch.cpp

Lines changed: 62 additions & 0 deletions
@@ -304,3 +304,65 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
         batch.logits = logits.data();
     }
 }
+
+//
+// interface implementation
+//
+
+struct llama_batch llama_batch_get_one(
+             llama_token * tokens,
+                 int32_t   n_tokens) {
+    return {
+        /*n_tokens =*/ n_tokens,
+        /*tokens   =*/ tokens,
+        /*embd     =*/ nullptr,
+        /*pos      =*/ nullptr,
+        /*n_seq_id =*/ nullptr,
+        /*seq_id   =*/ nullptr,
+        /*logits   =*/ nullptr,
+    };
+}
+
+struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
+    llama_batch batch = {
+        /*n_tokens =*/ 0,
+        /*tokens   =*/ nullptr,
+        /*embd     =*/ nullptr,
+        /*pos      =*/ nullptr,
+        /*n_seq_id =*/ nullptr,
+        /*seq_id   =*/ nullptr,
+        /*logits   =*/ nullptr,
+    };
+
+    if (embd) {
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+    } else {
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
+    }
+
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+    for (int i = 0; i < n_tokens_alloc; ++i) {
+        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+    }
+    batch.seq_id[n_tokens_alloc] = nullptr;
+
+    batch.logits   = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
+
+    return batch;
+}
+
+void llama_batch_free(struct llama_batch batch) {
+    if (batch.token)    free(batch.token);
+    if (batch.embd)     free(batch.embd);
+    if (batch.pos)      free(batch.pos);
+    if (batch.n_seq_id) free(batch.n_seq_id);
+    if (batch.seq_id) {
+        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
+            free(batch.seq_id[i]);
+        }
+        free(batch.seq_id);
+    }
+    if (batch.logits)   free(batch.logits);
+}

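For reference, here is a minimal usage sketch of the batch API implemented above. It is not part of the commit, and the token IDs, positions, and sequence assignments are made-up values: with embd == 0, llama_batch_init() allocates batch.token; the caller fills the arrays and sets n_tokens, then releases everything with llama_batch_free().

// hypothetical caller, for illustration only
#include "llama.h"

static void example_batch_usage(void) {
    const int32_t n_tokens  = 4;
    const int32_t n_seq_max = 1;

    // embd == 0 -> the token array is allocated instead of the embedding array
    llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, n_seq_max);

    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token[i]     = 100 + i;             // made-up token IDs
        batch.pos[i]       = i;                   // position within the sequence
        batch.n_seq_id[i]  = 1;                   // each token belongs to one sequence
        batch.seq_id[i][0] = 0;                   // ... namely sequence 0
        batch.logits[i]    = (i == n_tokens - 1); // request logits only for the last token
    }
    batch.n_tokens = n_tokens;

    // ... pass the batch to llama_decode() here ...

    llama_batch_free(batch); // frees token/embd/pos/n_seq_id/seq_id/logits
}
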
src/llama-batch.h

Lines changed: 0 additions & 1 deletion
@@ -86,4 +86,3 @@ struct llama_batch_allocr {
     // optionally fulfill the batch returned by llama_batch_get_one
     llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
 };
-

src/llama-impl.cpp

Lines changed: 33 additions & 17 deletions
@@ -3,6 +3,7 @@
 #include "llama.h"
 
 #include <cstdarg>
+#include <vector>
 
 struct llama_logger_state {
     ggml_log_callback log_callback = llama_log_callback_default;
@@ -19,23 +20,6 @@ time_meas::~time_meas() {
     }
 }
 
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    if (search.empty()) {
-        return;
-    }
-    std::string builder;
-    builder.reserve(s.length());
-    size_t pos = 0;
-    size_t last_pos = 0;
-    while ((pos = s.find(search, last_pos)) != std::string::npos) {
-        builder.append(s, last_pos, pos - last_pos);
-        builder.append(replace);
-        last_pos = pos + search.length();
-    }
-    builder.append(s, last_pos, std::string::npos);
-    s = std::move(builder);
-}
-
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
     g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
@@ -72,3 +56,35 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
     fputs(text, stderr);
     fflush(stderr);
 }
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
+std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}

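A small sketch, not part of the commit, of how the two helpers that now live at the bottom of llama-impl.cpp behave. Both are internal utilities declared in llama-impl.h; the example values are arbitrary.

#include "llama-impl.h"
#include <cstdio>

static void example_string_helpers(void) {
    std::string s = "a_b_c";
    replace_all(s, "_", "-");                                  // s is now "a-b-c"

    std::string msg = format("%s has %d layers", "model", 32);
    // msg is "model has 32 layers"; format() sizes the buffer with a first
    // vsnprintf pass and then writes into it, as shown in the diff above

    printf("%s | %s\n", s.c_str(), msg.c_str());
}
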
src/llama-impl.h

Lines changed: 5 additions & 5 deletions
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "ggml.h"
+#include "ggml.h" // for ggml_log_level
 
 #include <string>
 
@@ -22,10 +22,6 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
 void llama_log_internal        (ggml_log_level level, const char * format, ...);
 void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
-// TODO: rename to llama_format ?
-LLAMA_ATTRIBUTE_FORMAT(1, 2)
-std::string format(const char * fmt, ...);
-
 #define LLAMA_LOG(...)      llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
 #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
@@ -47,3 +43,7 @@ struct time_meas {
 };
 
 void replace_all(std::string & s, const std::string & search, const std::string & replace);
+
+// TODO: rename to llama_format ?
+LLAMA_ATTRIBUTE_FORMAT(1, 2)
+std::string format(const char * fmt, ...);

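As a side note (illustrative, not from the commit): judging by its use here, the indices passed to LLAMA_ATTRIBUTE_FORMAT mark the position of the printf-style format string and of the first variadic argument, so compilers that honor the attribute can type-check calls. llama_log_internal uses (2, 3) because its first argument is the log level, while the relocated format() declaration uses (1, 2).

// both calls below would be format-checked (e.g. with -Wformat); the values are arbitrary
LLAMA_LOG_INFO("loaded %d tensors in %.2f ms\n", 291, 12.5); // %d <- int, %.2f <- double
std::string s = format("n_embd = %d", 4096);                 // format string is argument 1 here
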
src/llama-model.cpp

Lines changed: 222 additions & 0 deletions
@@ -189,3 +189,225 @@ struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, co
 
     return it->second;
 }
+
+size_t llama_model_max_nodes(const llama_model & model) {
+    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
+}
+
+//
+// interface implementation
+//
+
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
+        /*.devices                     =*/ nullptr,
+        /*.n_gpu_layers                =*/ 0,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu                    =*/ 0,
+        /*.tensor_split                =*/ nullptr,
+        /*.rpc_servers                 =*/ nullptr,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
+        /*.vocab_only                  =*/ false,
+        /*.use_mmap                    =*/ true,
+        /*.use_mlock                   =*/ false,
+        /*.check_tensors               =*/ false,
+    };
+
+#ifdef GGML_USE_METAL
+    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
+    result.n_gpu_layers = 999;
+#endif
+
+    return result;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+    return model->vocab.type;
+}
+
+int32_t llama_n_vocab(const struct llama_model * model) {
+    return model->hparams.n_vocab;
+}
+
+int32_t llama_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
+int32_t llama_n_embd(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
+int32_t llama_n_head(const struct llama_model * model) {
+    return model->hparams.n_head();
+}
+
+enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+    switch (model->arch) {
+        // these models do not use RoPE
+        case LLM_ARCH_GPT2:
+        case LLM_ARCH_GPTJ:
+        case LLM_ARCH_MPT:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_BLOOM:
+        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
+        case LLM_ARCH_JAIS:
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            return LLAMA_ROPE_TYPE_NONE;
+
+        // use what we call a normal RoPE, operating on pairs of consecutive head values
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_DECI:
+        case LLM_ARCH_BAICHUAN:
+        case LLM_ARCH_STARCODER:
+        case LLM_ARCH_PLAMO:
+        case LLM_ARCH_ORION:
+        case LLM_ARCH_INTERNLM2:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_XVERSE:
+        case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
+        case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_CHAMELEON:
+            return LLAMA_ROPE_TYPE_NORM;
+
+        // the pairs of head values are offset by n_rot/2
+        case LLM_ARCH_FALCON:
+        case LLM_ARCH_GROK:
+        case LLM_ARCH_DBRX:
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_STABLELM:
+        case LLM_ARCH_BITNET:
+        case LLM_ARCH_QWEN:
+        case LLM_ARCH_QWEN2:
+        case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMO2:
+        case LLM_ARCH_OLMOE:
+        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
+        case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_OPENELM:
+        case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_NEMOTRON:
+        case LLM_ARCH_EXAONE:
+        case LLM_ARCH_MINICPM3:
+            return LLAMA_ROPE_TYPE_NEOX;
+
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
+        // all model arches should be listed explicitly here
+        case LLM_ARCH_UNKNOWN:
+            GGML_ABORT("unknown architecture");
+    }
+
+    return LLAMA_ROPE_TYPE_NONE;
+}
+
+float llama_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s %s %s",
+            llama_model_arch_name (*model).c_str(),
+            llama_model_type_name (*model).c_str(),
+            llama_model_ftype_name(*model).c_str());
+}
+
+uint64_t llama_model_size(const struct llama_model * model) {
+    return model->n_bytes;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    return model->n_elements;
+}
+
+bool llama_model_has_encoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5:        return true;
+        case LLM_ARCH_T5ENCODER: return true;
+        default:                 return false;
+    }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5ENCODER: return false;
+        default:                 return true;
+    }
+}
+
+llama_token llama_model_decoder_start_token(const struct llama_model * model) {
+    return model->hparams.dec_start_token_id;
+}
+
+bool llama_model_is_recurrent(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_MAMBA: return true;
+        case LLM_ARCH_RWKV6: return true;
+        default:             return false;
+    }
+}

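For orientation, a rough usage sketch (not part of the commit) of the metadata accessors added above, iterating a loaded llama_model however it was obtained; the buffer sizes are arbitrary.

#include "llama.h"
#include <cstdio>

static void dump_model_meta(const struct llama_model * model) {
    char key[256];
    char val[256];

    const int32_t n = llama_model_meta_count(model);
    for (int32_t i = 0; i < n; ++i) {
        // both calls follow snprintf semantics and return -1 for an out-of-range index
        llama_model_meta_key_by_index    (model, i, key, sizeof(key));
        llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
        printf("%s = %s\n", key, val);
    }

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc)); // "<arch> <type> <ftype>"
    printf("%s | %llu params, %llu bytes\n", desc,
           (unsigned long long) llama_model_n_params(model),
           (unsigned long long) llama_model_size(model));
}
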
src/llama-model.h

Lines changed: 1 addition & 0 deletions
@@ -377,3 +377,4 @@ ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, in
 // used by llama_adapter_lora
 struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
 
+size_t llama_model_max_nodes(const llama_model & model);

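Finally, a quick worked example (with hypothetical tensor counts) of the formula behind the newly exposed llama_model_max_nodes(), which never returns less than 8192:

#include <algorithm>
#include <cstddef>

// same formula as llama_model_max_nodes(): max(8192, n_tensors*5)
static size_t max_nodes_for(size_t n_tensors) {
    return std::max<size_t>(8192, n_tensors*5);
}

// max_nodes_for(300)  == 8192   -> small models hit the 8192 floor
// max_nodes_for(2500) == 12500  -> larger models get 5 nodes per tensor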