@@ -5111,6 +5111,53 @@ struct llm_build_context {
         return gf;
     }

+    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        for (int il = 0; il < n_layer; ++il) {
+            for (int i = 0; i < n_kv; ++i) {
+                const int id = ids[i];
+
+                if (i == id || id == n_kv) {
+                    continue;
+                }
+
+                int nm = 1;
+
+                while (i + nm < n_kv && (int) ids[i + nm] == id + nm) {
+                    nm++;
+                }
+
+                ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+
+                ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+                ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, i));
+
+                ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, id));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+
+                i += nm - 1;
+            }
+        }
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

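For reference, the inner loop above coalesces consecutive cell moves into single copies: ids[i] holds the destination cell of source cell i, cells with ids[i] == i or ids[i] == n_kv are skipped, and a run of nm sources mapping to nm consecutive destinations becomes one ggml_cpy per K and one per V tensor. The following standalone sketch is not part of the patch (the ids values are made up for illustration) and reproduces only that coalescing logic:

// coalesce.cpp - illustration of the run coalescing performed by build_defrag()
#include <cstdio>
#include <vector>

int main() {
    const int n_kv = 8;
    // hypothetical defrag plan: cells 3..5 move to 1..3, cell 7 moves to 4,
    // everything else either stays put (ids[i] == i) or is a hole (ids[i] == n_kv)
    std::vector<int> ids = { 0, 8, 8, 1, 2, 3, 8, 4 };

    for (int i = 0; i < n_kv; ++i) {
        const int id = ids[i];

        if (i == id || id == n_kv) {
            continue; // cell is not moved or is empty
        }

        int nm = 1;
        while (i + nm < n_kv && ids[i + nm] == id + nm) {
            nm++; // extend the contiguous run of moves
        }

        // in build_defrag() this becomes one K copy and one V copy per layer
        printf("copy %d cells: src [%d, %d) -> dst [%d, %d)\n", nm, i, i + nm, id, id + nm);

        i += nm - 1;
    }

    return 0;
}

With these sample ids the program prints two moves (3 cells from 3 to 1, then 1 cell from 7 to 4) instead of four single-cell copies, which is exactly the reduction in graph nodes the coalescing buys.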
@@ -7505,6 +7552,23 @@ struct llm_build_context {
     }
 };

+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+    llama_batch dummy;
+    dummy.n_tokens = 0;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_defrag(ids);
+
+    llm.free();
+
+    return result;
+}
+
 static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
     llama_batch dummy;
     dummy.n_tokens = 0;
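The graph returned by llama_build_graph_defrag() consists purely of ggml_cpy nodes between 2D views of the K/V cache tensors, so the data never has to leave the backend. A minimal standalone sketch of that view-and-copy pattern on the CPU ggml API (tensor shape, values and the move are made up; this is not code from the patch):

// view_copy.cpp - moving rows inside a single tensor via ggml_view_2d + ggml_cpy
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a toy "cache" of 8 rows of 4 floats; fill row r with the value r
    struct ggml_tensor * cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
    for (int r = 0; r < 8; ++r) {
        for (int c = 0; c < 4; ++c) {
            ((float *) cache->data)[r*4 + c] = (float) r;
        }
    }

    const size_t row = ggml_row_size(cache->type, 4);

    // one coalesced move: copy 3 rows starting at row 5 down to row 1
    struct ggml_tensor * src = ggml_view_2d(ctx, cache, 4, 3, row, 5*row);
    struct ggml_tensor * dst = ggml_view_2d(ctx, cache, 4, 3, row, 1*row);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_cpy(ctx, src, dst));
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    for (int r = 0; r < 8; ++r) {
        printf("row %d = %.0f\n", r, ((float *) cache->data)[r*4]);
    }

    ggml_free(ctx);
    return 0;
}

build_defrag() records the same kind of copy for every coalesced run, per layer, for both the K tensor (row-major, stride n_embd_k_gqa) and the V tensor (transposed layout, stride kv_self.size).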
@@ -8030,12 +8094,6 @@ static int llama_decode_internal(
 // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache
 // this way we eliminate any empty holes that may have been left by previous KV cache operations
 //
-// TODO: optimizations are possible:
-//       - multiple threads
-//       - avoid copying to the host memory when already there
-//
-// TODO: can we do all this on-device?
-//
 static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;

@@ -8053,9 +8111,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {

     const int64_t t_start = ggml_time_us();

-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
     // number of cells moved
     uint32_t n_moves = 0;

@@ -8136,6 +8191,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         kv_self.cells[i] = llama_kv_cell();
     }

+#if 0
+    // CPU defrag
+    //
+    // TODO: optimizations are possible:
+    //       - multiple threads
+    //       - avoid copying to the host memory when already there
+    //
+    // likely not worth the effort, as we have ggml_graph based defrag
+    //
+
+    std::vector<uint8_t> buf_k;
+    std::vector<uint8_t> buf_v;
+
     for (uint32_t il = 0; il < n_layer; ++il) {
         const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
         const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
@@ -8188,6 +8256,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
         ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
     }
+#else
+    // ggml_graph defrag
+
+    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+#endif

     const int64_t t_end = ggml_time_us();

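Applications do not call llama_kv_cache_defrag_internal() directly; assuming the public llama_kv_cache_defrag() and llama_kv_cache_update() entry points declared in llama.h, a caller would request defragmentation roughly as in the hypothetical helper below, which is shown only to illustrate how the graph-based path above gets exercised:

// hypothetical helper around the public API from llama.h (not part of the patch)
#include "llama.h"

static void request_defrag(struct llama_context * ctx) {
    // mark the KV cache for defragmentation; the actual copies are performed by
    // the defrag graph built above when the pending cache updates are applied,
    // either lazily during decoding or explicitly via llama_kv_cache_update()
    llama_kv_cache_defrag(ctx);
    llama_kv_cache_update(ctx);
}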