1 file changed: 0 additions, 3 deletions

@@ -14613,23 +14613,20 @@ static int llama_decode_internal(
     const struct llama_hparams & hparams = model.hparams;
     const int64_t n_layer = hparams.n_layer;
     const int64_t kv_head = kv_self.head;
-    std::vector<void *> kv_cache_ptrs;
     std::vector<void *> k_cache_ptrs;
     std::vector<void *> v_cache_ptrs;
     for (int il = 0; il < n_layer; ++il) {
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         ggml_tensor * tmp_tensor = kv_self.k_l[il];
         size_t tmp_offset = (ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa))*kv_head;
-        kv_cache_ptrs.push_back(static_cast<char*>(tmp_tensor->data) + tmp_offset);
         k_cache_ptrs.push_back(static_cast<char*>(tmp_tensor->data) + tmp_offset);
         tmp_tensor = kv_self.v_l[il];
         if (cparams.flash_attn) {
             tmp_offset = (kv_head)*ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
         } else {
             tmp_offset = (kv_head)*ggml_element_size(kv_self.v_l[il]);
         }
-        kv_cache_ptrs.push_back(static_cast<char*>(tmp_tensor->data) + tmp_offset);
         v_cache_ptrs.push_back(static_cast<char*>(tmp_tensor->data) + tmp_offset);
     }
 
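The hunk removes the combined kv_cache_ptrs vector, which duplicated every pointer already collected in the separate k_cache_ptrs and v_cache_ptrs vectors. For context, the sketch below reproduces the per-layer offset arithmetic in standalone form: K is stored one row per cached cell, so skipping kv_head cells skips kv_head full rows, while V (in llama.cpp's non-flash-attention layout, which stores V transposed) advances by one element per cell within each row. The Layer struct, row sizes, and element size are assumptions for illustration, standing in for the real ggml tensors and the ggml_row_size / ggml_element_size helpers; this is a minimal sketch, not the actual implementation.

    // Standalone sketch of the KV-cache pointer arithmetic in the diff above.
    // Layer and its size fields are hypothetical stand-ins for the ggml types.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Layer {
        char * k_data;          // base of this layer's K cache tensor
        char * v_data;          // base of this layer's V cache tensor
        size_t k_row_size;      // bytes per cached cell in K
        size_t v_row_size;      // bytes per cached cell in V (flash-attn layout)
        size_t v_element_size;  // bytes per element in V (transposed layout)
    };

    int main() {
        const int    n_layer    = 2;
        const size_t kv_head    = 5;     // index of the first free cache cell
        const bool   flash_attn = false;

        std::vector<char> k_buf(4096), v_buf(4096);
        std::vector<Layer> layers(n_layer,
            Layer{k_buf.data(), v_buf.data(),
                  /*k_row_size=*/256, /*v_row_size=*/256, /*v_element_size=*/2});

        std::vector<void *> k_cache_ptrs;
        std::vector<void *> v_cache_ptrs;

        for (int il = 0; il < n_layer; ++il) {
            const Layer & l = layers[il];

            // K: row-per-cell layout, so the write position for cell kv_head
            // is kv_head whole rows past the tensor base.
            k_cache_ptrs.push_back(l.k_data + l.k_row_size * kv_head);

            // V: with flash attention the layout matches K; otherwise V is
            // stored transposed and one cell is one element within each row.
            const size_t v_off = flash_attn ? l.v_row_size     * kv_head
                                            : l.v_element_size * kv_head;
            v_cache_ptrs.push_back(l.v_data + v_off);
        }

        std::printf("layer 0: k offset = %zu bytes, v offset = %zu bytes\n",
                    (size_t)((char *)k_cache_ptrs[0] - k_buf.data()),
                    (size_t)((char *)v_cache_ptrs[0] - v_buf.data()));
        return 0;
    }

With the example values above, the K pointer lands 1280 bytes (5 rows of 256) past the base while the V pointer lands only 10 bytes (5 elements of 2) in, which is the asymmetry the flash_attn branch in the original code accounts for.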