@@ -2455,6 +2455,7 @@ struct llama_layer {
2455
2455
// long rope factors
2456
2456
struct ggml_tensor * rope_long = nullptr;
2457
2457
struct ggml_tensor * rope_short = nullptr;
2458
+ struct ggml_tensor * rope_freqs = nullptr;
2458
2459
2459
2460
// bitnet scale
2460
2461
struct ggml_tensor * wq_scale;
@@ -6055,6 +6056,8 @@ static bool llm_load_tensors(
6055
6056
6056
6057
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6057
6058
6059
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
6060
+
6058
6061
if (n_expert == 0) {
6059
6062
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
6060
6063
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -8532,6 +8535,10 @@ struct llm_build_context {
8532
8535
// choose long/short freq factors based on the context size
8533
8536
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
8534
8537
8538
+ if (model.layers[il].rope_freqs != nullptr) {
8539
+ return model.layers[il].rope_freqs;
8540
+ }
8541
+
8535
8542
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
8536
8543
return model.layers[il].rope_long;
8537
8544
}
@@ -8726,6 +8733,9 @@ struct llm_build_context {
8726
8733
8727
8734
// self-attention
8728
8735
{
8736
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
8737
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
8738
+
8729
8739
// compute Q and K and RoPE them
8730
8740
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
8731
8741
cb(Qcur, "Qcur", il);
@@ -8749,14 +8759,14 @@ struct llm_build_context {
8749
8759
}
8750
8760
8751
8761
Qcur = ggml_rope_ext(
8752
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr ,
8762
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors ,
8753
8763
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8754
8764
ext_factor, attn_factor, beta_fast, beta_slow
8755
8765
);
8756
8766
cb(Qcur, "Qcur", il);
8757
8767
8758
8768
Kcur = ggml_rope_ext(
8759
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr ,
8769
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors ,
8760
8770
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8761
8771
ext_factor, attn_factor, beta_fast, beta_slow
8762
8772
);
0 commit comments