@@ -3561,7 +3561,7 @@ static bool llama_kv_cache_find_slot(
         }
 
         // gather and re-order
-        for (int32_t s = 0; s < n_seqs; ++s) {
+        for (uint32_t s = 0; s < n_seqs; ++s) {
             int32_t dst_id = s + min;
             int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
             if (dst_id != src_id) {
@@ -3588,15 +3588,15 @@ static bool llama_kv_cache_find_slot(
             int32_t cell_id = s + min;
             llama_kv_cell & cell = cache.cells[cell_id];
 
-            if (last_pos != cell.pos + n_seq_tokens) {
+            if (last_pos != cell.pos + (llama_pos) n_seq_tokens) {
                 // What should happen when the pos backtracks or skips a value?
                 // Clearing the state mid-batch would require special-casing which isn't done.
                 LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
                     __func__, last_pos, cell.pos, batch.seq_id[s][0]);
             }
             cell.pos = last_pos;
             cell.seq_id.clear();
-            for (uint32_t j = 0; j < batch.n_seq_id[s]; ++ j) {
+            for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
                 const llama_seq_id seq_id = batch.seq_id[s][j];
                 cell.seq_id.insert(seq_id);
                 cache.cells[seq_id].tail = cell_id;
@@ -3803,7 +3803,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     uint32_t new_head = cache.size;
 
     for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.recurrent && i != seq_id) {
+        if (cache.recurrent && (llama_seq_id) i != seq_id) {
            cache.cells[i].tail = -1;
         }
         if (!cache.cells[i].has_seq_id(seq_id)) {
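The casts in the three hunks above all address the same class of -Wsign-compare warnings: a loop counter of one signedness compared against a bound or id of the other. A minimal standalone sketch of the pattern, with seq_id_t standing in for llama_seq_id (which llama.h defines as a signed 32-bit integer); names and values here are illustrative only:

#include <cstdint>
#include <cstdio>

// seq_id_t stands in for llama_seq_id, a signed 32-bit typedef in llama.h.
typedef int32_t seq_id_t;

int main() {
    const uint32_t cache_size = 8;  // unsigned, like llama_kv_cache::size
    const seq_id_t keep_id    = 3;  // signed sequence id to keep

    for (uint32_t i = 0; i < cache_size; ++i) {
        // Writing `i != keep_id` directly mixes unsigned and signed operands
        // and triggers -Wsign-compare; the explicit cast states the intended
        // comparison and silences the warning, as in the hunk above.
        if ((seq_id_t) i != keep_id) {
            printf("cell %u: tail would be reset\n", i);
        }
    }
    return 0;
}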
@@ -8992,6 +8992,7 @@ static struct ggml_tensor * llm_build_mamba(
 
     // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
     cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs);
+    cb(cur, "mamba_out", il);
 
     return cur;
 }
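The one functional addition in the final hunk is the cb(cur, "mamba_out", il) call, which gives the reshaped Mamba layer output a name in the compute graph. A rough sketch of what such a naming callback typically looks like in llama.cpp's graph-builder code; the helper below is illustrative, not the exact implementation, though ggml_format_name and ggml_set_name are real ggml APIs:

#include "ggml.h"

// Illustrative version of the `cb` naming callback used by the llm_build_*
// functions: per-layer tensors get "<name>-<layer>", while tensors with
// il < 0 keep the plain name.
static void name_tensor(struct ggml_tensor * cur, const char * name, int il) {
    if (il >= 0) {
        ggml_format_name(cur, "%s-%d", name, il);  // e.g. "mamba_out-0"
    } else {
        ggml_set_name(cur, name);
    }
}

Once the tensor is named, tooling that walks the graph (for example an eval callback that matches on tensor names) can locate the Mamba block's per-layer output when debugging.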