Commit 609ec7e

llama : functions -> methods (#11110)
1 parent c3f9d25 commit 609ec7e

14 files changed: +4602 −4498 lines
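The commit message summarizes the whole change: free functions that took a llama_model or llama_vocab as an explicit argument (llama_model_select_buft, llama_model_get_tensor, llama_token_is_eog_impl, ...) become methods on those types (model.select_buft, model.get_tensor, vocab->is_eog, ...). A minimal, self-contained sketch of that refactor shape, using made-up stand-in types rather than the real llama.cpp declarations:

#include <cstdio>

// Stand-in for llama_model; the real struct carries far more state.
struct model_t {
    int n_layer = 4;

    // After: the selection logic is a method on the type that owns the data ...
    int select_buft(int il) const { return il < n_layer ? il : -1; }
};

// ... where before it was a free function taking the model as its first argument.
static int model_select_buft(const model_t & model, int il) {
    return il < model.n_layer ? il : -1;
}

int main() {
    model_t model;
    std::printf("free function: %d\n", model_select_buft(model, 2)); // old call shape
    std::printf("method:        %d\n", model.select_buft(2));        // new call shape
    return 0;
}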

src/llama-adapter.cpp

Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,6 @@
 #include "llama-adapter.h"
 
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -62,7 +63,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
@@ -262,7 +263,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     }
 
     // device buft and device ctx
-    auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+    const auto * model_tensor = model.get_tensor(name.c_str());
     if (!model_tensor) {
         throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
     }

src/llama-context.cpp

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,7 @@
 #include "llama-context.h"
 
+#include "llama-mmap.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -504,7 +506,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 
     auto * buft = ggml_backend_cpu_buffer_type();
     // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-    auto * output_dev = lctx.model.dev_output.dev;
+    auto * output_dev = lctx.model.dev_output();
     auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
     if (output_dev_host_buft) {
         buft = output_dev_host_buft;
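The second hunk is a slightly different flavor of the same cleanup: a direct member chain (lctx.model.dev_output.dev) is replaced by an accessor (lctx.model.dev_output()), so callers no longer depend on how the model stores its device bookkeeping. A small stand-alone sketch of that encapsulation step, with made-up names (device_t, dev_output_) rather than the real llama_model layout:

#include <cstdio>

// Stand-in device handle; in llama.cpp this is a ggml backend device pointer.
using device_t = int;

struct model_t {
    // Before the change, callers reached into a public member like `dev_output.dev`.
    // Keeping the field private and exposing an accessor lets the model change how it
    // stores device assignments without touching call sites.
    device_t dev_output() const { return dev_output_; }

private:
    device_t dev_output_ = 0;
};

int main() {
    model_t model;
    std::printf("output device: %d\n", model.dev_output());
    return 0;
}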

src/llama-grammar.cpp

Lines changed: 4 additions & 4 deletions
@@ -1092,9 +1092,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         const llama_token id = cur_p->data[i].id;
-        const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
+        const std::string & piece = grammar.vocab->token_to_piece(id);
 
-        if (llama_token_is_eog_impl(*grammar.vocab, id)) {
+        if (grammar.vocab->is_eog(id)) {
             if (!allow_eog) {
                 cur_p->data[i].logit = -INFINITY;
             }
@@ -1115,7 +1115,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     GGML_ASSERT(grammar.vocab != nullptr);
 
-    if (llama_token_is_eog_impl(*grammar.vocab, token)) {
+    if (grammar.vocab->is_eog(token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
                 return;
@@ -1124,7 +1124,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
+    const std::string & piece = grammar.vocab->token_to_piece(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar.partial_utf8);

src/llama-kv-cache.cpp

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ bool llama_kv_cache_init(
 
     ggml_backend_buffer_type_t buft;
     if (offload) {
-        auto * dev = model.dev_layer.at(i).dev;
+        auto * dev = model.dev_layer(i);
         buft = ggml_backend_dev_buffer_type(dev);
     } else {
         buft = ggml_backend_cpu_buffer_type();

src/llama-mmap.cpp

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
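The only functional change in this file is the added `static`, which gives the Windows-only helper internal linkage: it is visible only inside llama-mmap.cpp and cannot clash with another definition at link time. A tiny stand-alone illustration of the same idea (format_err is a made-up name, not a llama.cpp symbol):

#include <cstdio>
#include <string>

// `static` at namespace scope means internal linkage: this helper is private to the
// current translation unit, so another .cpp may define its own format_err without
// causing a duplicate-symbol error or an ODR violation.
static std::string format_err(int code) {
    return "error " + std::to_string(code);
}

int main() {
    std::printf("%s\n", format_err(42).c_str());
    return 0;
}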

src/llama-model-loader.cpp

Lines changed: 61 additions & 0 deletions
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,6 +21,49 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }
 
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:        return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:     return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16:    return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0:    return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1:    return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0:    return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1:    return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0:    return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:  return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:  return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:  return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:  return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:  return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0:   return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0:   return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:   return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
     template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
@@ -1009,3 +1056,17 @@ bool llama_model_loader::load_all_data(
 
     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size   = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size   = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
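The new print_info() reports the file size in MiB below the 1 GiB threshold and in GiB above it, and computes bits-per-weight as total bits divided by total element count. The same arithmetic in a stand-alone form (the n_bytes and n_elements values below are invented examples, not taken from any model):

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t kiB = 1024, MiB = 1024*kiB, GiB = 1024*MiB;

    // Invented example: a ~4.3 GiB file holding 8 billion weights.
    const uint64_t n_bytes    = 4*GiB + 300*MiB;
    const uint64_t n_elements = 8000000000ull;

    const double bpw = n_bytes*8.0/n_elements; // bits per weight

    if (n_bytes < GiB) {
        std::printf("file size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, bpw);
    } else {
        std::printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, bpw);
    }
    return 0;
}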

src/llama-model-loader.h

Lines changed: 4 additions & 0 deletions
@@ -155,4 +155,8 @@ struct llama_model_loader {
         llama_mlocks * lmlocks,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
