
Commit dbaf7e6

nyxkrage authored and arthw committed
llama : fix llama3.1 rope_freqs not respecting custom head_dim (ggml-org#9141)
* fix: llama3.1 rope_freqs not respecting custom head_dim
* fix: use potential head_dim for Exaone
1 parent a57abe3 commit dbaf7e6

2 files changed: +5, -4 lines changed
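For context, the rope_freqs tensor written by the converter holds one scaling factor per rotary frequency, and there are dim/2 such frequencies, so dim has to be the model's actual per-head dimension. Below is a minimal standalone sketch of that derivation; it paraphrases the Llama 3.1 rope-scaling recipe with illustrative default values (the function name and the parameter defaults are not taken from this repository):

```python
import math
import torch

def llama3_rope_factors(dim: int,
                        base: float = 500000.0,
                        factor: float = 8.0,
                        low_freq_factor: float = 1.0,
                        high_freq_factor: float = 4.0,
                        old_context_len: int = 8192) -> torch.Tensor:
    """Per-frequency scaling factors: one for each of the dim/2 rotary frequencies."""
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

    low_freq_wavelen  = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    factors = []
    for freq in freqs.tolist():
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:      # high-frequency band: leave untouched
            factors.append(1.0)
        elif wavelen > low_freq_wavelen:     # low-frequency band: scale fully
            factors.append(factor)
        else:                                # smooth interpolation in between
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
    return torch.tensor(factors, dtype=torch.float32)

# The output has dim/2 entries, so a wrong dim produces a wrongly sized rope_freqs tensor.
print(llama3_rope_factors(dim=128).shape)   # torch.Size([64])
```

Because the output length is dim/2, deriving dim from hidden_size // num_attention_heads alone yields the wrong size whenever the config sets head_dim explicitly, which is what the two diffs below correct.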

convert_hf_to_gguf.py

Lines changed: 2 additions & 2 deletions
@@ -1572,7 +1572,7 @@ def prepare_tensors(self):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -3820,7 +3820,7 @@ def prepare_tensors(self):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
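To see the effect of the change above, here is a minimal sketch with hypothetical hparams (head_dim is deliberately set so it differs from hidden_size // num_attention_heads; the numbers are illustrative, not taken from a real config):

```python
import torch

# Hypothetical hparams for illustration only.
hparams = {"hidden_size": 3072, "num_attention_heads": 32, "head_dim": 128,
           "rope_theta": 500000.0}

base = hparams.get("rope_theta", 10000.0)

# Old behaviour: dim derived solely from hidden_size and num_attention_heads.
dim_old = hparams["hidden_size"] // hparams["num_attention_heads"]   # 96

# Fixed behaviour: prefer the explicit head_dim when the config provides one.
dim_new = hparams.get("head_dim", dim_old)                           # 128

freqs_old = 1.0 / (base ** (torch.arange(0, dim_old, 2, dtype=torch.float32) / dim_old))
freqs_new = 1.0 / (base ** (torch.arange(0, dim_new, 2, dtype=torch.float32) / dim_new))

print(freqs_old.shape, freqs_new.shape)   # torch.Size([48]) torch.Size([64])
```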

src/llama.cpp

Lines changed: 3 additions & 2 deletions
@@ -6609,6 +6609,7 @@ static bool llm_load_tensors(
     const int64_t n_embd_gqa    = n_embd_v_gqa;
     const int64_t n_vocab       = hparams.n_vocab;
     const int64_t n_vocab_type  = hparams.n_vocab_type;
+    const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
     const int64_t n_ctx_train   = hparams.n_ctx_train;
@@ -6666,7 +6667,7 @@ static bool llm_load_tensors(

                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));

                         if (n_expert == 0) {
                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
@@ -8197,7 +8198,7 @@ static bool llm_load_tensors(
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
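On the loading side, the expected rope_freqs length is now derived from hparams.n_rot (the rotary dimension, which equals the head dimension for these models) instead of n_embd/n_head, matching what the converter writes. A small sketch of the resulting size check, reusing the hypothetical numbers from the converter example above (illustrative values only):

```python
# Hypothetical values matching the converter sketch above.
n_embd = 3072   # hidden_size
n_head = 32     # num_attention_heads
n_rot  = 128    # head_dim, recorded in the GGUF metadata by the converter

# Length of the rope_freqs tensor produced by convert_hf_to_gguf.py:
rope_freqs_len = n_rot // 2                # 64

# Old loader expectation vs. new one:
old_expected = n_embd // n_head // 2       # 48 -> shape mismatch at load time
new_expected = n_rot // 2                  # 64 -> matches the converted tensor

assert rope_freqs_len == new_expected
print(old_expected, new_expected)          # 48 64
```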

0 commit comments

Comments
 (0)