@@ -7131,7 +7131,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
            } break;
        case GGML_OP_MUL_MAT:
            {
-               ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
+               ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
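+               // b takes the same ne[2]/ne[3] as w, so weights with extra dims still form a valid mul_mat test op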
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
@@ -7171,18 +7171,38 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
            } break;
        case GGML_OP_SSM_CONV:
            {
-               // TODO: ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d);
-               op_tensor = ggml_ssm_conv(ctx, nullptr, w);
+               // FIXME
+               ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
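+               // conv_x is only a dummy input; 12345 and 6789 are arbitrary placeholder sizes (hence the FIXME)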
+               op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
-               // TODO: ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C);
-               op_tensor = ggml_ssm_scan(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+               // FIXME
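+               // dummy inputs shaped after the ssm_a weight (w): d_state = w->ne[0], d_inner = w->ne[1]; n_seq_tokens and n_seqs are placeholders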
+               const int64_t d_state = w->ne[0];
+               const int64_t d_inner = w->ne[1];
+               const int64_t n_seq_tokens = 512;
+               const int64_t n_seqs = 1;
+               ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
+               ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+               ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+               ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+               ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+               op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
            } break;
        case GGML_OP_RWKV_WKV:
            {
-               // TODO: ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
-               op_tensor = ggml_rwkv_wkv(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr);
+               // FIXME
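+               // dummy inputs; all sizes use the arbitrary placeholder 123, and tf reuses the weight under test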
+               const int64_t S = 123;
+               const int64_t H = 123;
+               const int64_t n_tokens = 123;
+               const int64_t n_seqs = 123;
+               ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens);
+               ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+               ggml_tensor * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+               ggml_tensor * tf = w;
+               ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
+               ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+               op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
@@ -7462,7 +7482,7 @@ static bool llm_load_tensors(

        // tensors with "bias" suffix are always used with GGML_OP_ADD
        ggml_op op;
-       bool bias = strcmp(tn.suffix, "bias") == 0;
+       bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
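+       // (tn.suffix may be null, so it is checked before calling strcmp)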
        if (bias) {
            op = GGML_OP_ADD;
        } else {
@@ -19690,7 +19710,7 @@ struct llama_context * llama_new_context_with_model(
            int n_nodes_tg = ggml_graph_n_nodes(gf_tg);

            // reserve again with pp graph to avoid ggml-alloc reallocations during inference
-           gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
+           gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
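+           // passing true builds the worst-case graph, so the reserved buffers cover what inference may need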
            if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                llama_free(ctx);