Commit 4eaaace

llama : ggml_graph based defrag implementation
ggml-ci
1 parent 65323bc commit 4eaaace

File tree

1 file changed (+94, −18)
llama.cpp

Lines changed: 94 additions & 18 deletions
@@ -5111,6 +5111,53 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        for (int il = 0; il < n_layer; ++il) {
+            for (int i = 0; i < n_kv; ++i) {
+                const int id = ids[i];
+
+                if (i == id || id == n_kv) {
+                    continue;
+                }
+
+                int nm = 1;
+
+                while (i + nm < n_kv && (int) ids[i + nm] == id + nm) {
+                    nm++;
+                }
+
+                ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+
+                ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+                ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, i));
+
+                ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, id));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+
+                i += nm - 1;
+            }
+        }
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@@ -7505,6 +7552,23 @@ struct llm_build_context {
     }
 };
 
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+    llama_batch dummy;
+    dummy.n_tokens = 0;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_defrag(ids);
+
+    llm.free();
+
+    return result;
+}
+
 static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
     llama_batch dummy;
     dummy.n_tokens = 0;
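
The wrapper mirrors llama_build_graph_k_shift just below it: a zero-token dummy batch and a no-op callback are enough state for llm_build_context, because the defrag graph reads and writes only the KV cache tensors. As a hypothetical debugging aid (not in the commit), the returned graph can be inspected with the public ggml API before it is computed; each coalesced run contributes a pair of view+cpy chains per layer:

    // hypothetical snippet at the call site; lctx and ids as in the diff
    struct ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);

    fprintf(stderr, "defrag graph: %d nodes\n", gf->n_nodes);
    ggml_graph_print(gf); // dumps every op (views and cpys) with shapes
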
@@ -8030,32 +8094,16 @@ static int llama_decode_internal(
 // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache
 // this way we eliminate any empty holes that may have been left by previous KV cache operations
 //
-// TODO: optimizations are possible:
-//       - multiple threads
-//       - avoid copying to the host memory when already there
-//
-// TODO: can we do all this on-device?
-//
 static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
 
-    const auto & hparams = lctx.model.hparams;
-
-    const uint32_t n_layer      = hparams.n_layer;
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-    const uint32_t n_kv         = llama_kv_cache_cell_max(kv_self);
-    const uint32_t n_used       = kv_self.used;
-
-    const uint32_t kv_size = kv_self.size;
+    const uint32_t n_kv   = llama_kv_cache_cell_max(kv_self);
+    const uint32_t n_used = kv_self.used;
 
     assert(n_used <= n_kv);
 
     const int64_t t_start = ggml_time_us();
 
-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
     // number of cells moved
     uint32_t n_moves = 0;

@@ -8136,6 +8184,27 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         kv_self.cells[i] = llama_kv_cell();
     }
 
+#if 0
+    // CPU defrag
+    //
+    // TODO: optimizations are possible:
+    //       - multiple threads
+    //       - avoid copying to the host memory when already there
+    //
+    // likely not worth the effort, as we have ggml_graph based defrag
+    //
+
+    const auto & hparams = lctx.model.hparams;
+
+    const uint32_t n_layer      = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+    const uint32_t kv_size = kv_self.size;
+
+    std::vector<uint8_t> buf_k;
+    std::vector<uint8_t> buf_v;
+
     for (uint32_t il = 0; il < n_layer; ++il) {
         const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
         const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);

@@ -8188,6 +8257,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
         ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
     }
+#else
+    // ggml_graph defrag
+
+    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+#endif
 
     const int64_t t_end = ggml_time_us();
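
The old host round-trip is kept under #if 0 for reference; the live path builds the graph once and evaluates it through llama_graph_compute, so the copies run on whatever backend holds the KV cache. The core idea reduces to recording ggml_cpy nodes between non-overlapping views of one tensor and computing the graph. A self-contained sketch of the same technique against plain ggml on the CPU; the 8x4 "cache", the row indices, and the sizes are invented for illustration:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // a toy "cache": 8 cells of 4 floats; pretend cells 4..6 must move to 1..3
        struct ggml_tensor * cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
        for (int i = 0; i < 8*4; ++i) {
            ((float *) cache->data)[i] = (float) i;
        }

        const size_t row = ggml_row_size(cache->type, 4);

        // views over the source run (cells 4..6) and the destination run (cells 1..3)
        struct ggml_tensor * src = ggml_view_2d(ctx, cache, 4, 3, row, 4*row);
        struct ggml_tensor * dst = ggml_view_2d(ctx, cache, 4, 3, row, 1*row);

        // record the move as a single cpy node and evaluate the graph
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, ggml_cpy(ctx, src, dst));
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        // cell 1 now holds the data that used to live in cell 4 (first value 16)
        printf("cache[1][0] = %.0f\n", ((float *) cache->data)[4]);

        ggml_free(ctx);
        return 0;
    }

In llama.cpp the same pattern goes through the backend scheduler, which is what makes this path cheaper than the #if 0 host copy when the cache lives on a GPU.
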
