@@ -308,6 +308,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
@@ -424,6 +425,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,            "%s.attention.scale" },

     { LLM_KV_ROPE_DIMENSION_COUNT,       "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS,    "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE,             "%s.rope.freq_base" },
     { LLM_KV_ROPE_SCALE_LINEAR,          "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE,          "%s.rope.scaling.type" },
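The "%s" in these key templates is filled in with the architecture name when the metadata key is built, so a converted Qwen2-VL GGUF would carry the new sections under a key like the one printed below. A minimal sketch, assuming the architecture string is "qwen2vl"; the stored values would mirror the previously hard-coded {16, 24, 24, 0}:

#include <cstdio>

int main() {
    char key[128];
    // llama.cpp substitutes the architecture name into the "%s" template;
    // "qwen2vl" is assumed here as the Qwen2-VL architecture string.
    std::snprintf(key, sizeof(key), "%s.rope.dimension_sections", "qwen2vl");
    std::printf("%s\n", key); // -> qwen2vl.rope.dimension_sections
    return 0;
}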
@@ -2407,11 +2409,12 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;

-    float rope_attn_factor = 1.0f;
-    float rope_freq_base_train;
-    float rope_freq_scale_train;
-    uint32_t n_ctx_orig_yarn;
-    float rope_yarn_log_mul;
+    float    rope_attn_factor = 1.0f;
+    float    rope_freq_base_train;
+    float    rope_freq_scale_train;
+    uint32_t n_ctx_orig_yarn;
+    float    rope_yarn_log_mul;
+    std::array<uint32_t, 4> rope_mrope_sections;

     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -2466,8 +2469,9 @@ struct llama_hparams {
         if (this->n_ff_shexp      != other.n_ff_shexp)      return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;

-        if (this->rope_finetuned  != other.rope_finetuned)  return true;
-        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
+        if (this->rope_finetuned      != other.rope_finetuned)      return true;
+        if (this->n_ctx_orig_yarn     != other.n_ctx_orig_yarn)     return true;
+        if (this->rope_mrope_sections != other.rope_mrope_sections) return true;

         if (this->ssm_d_conv  != other.ssm_d_conv)  return true;
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -5620,8 +5624,12 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2VL:
+            {
+                std::fill(hparams.rope_mrope_sections.begin(), hparams.rope_mrope_sections.end(), 0);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_mrope_sections, 4, true);
+            }
+        case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
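In the hunk above, LLM_ARCH_QWEN2VL zero-fills the four-entry array, reads rope.dimension_sections from the GGUF (the trailing true presumably marks the key as required), and then, since there is no break, falls through into the shared LLM_ARCH_QWEN2 handling. A self-contained sketch of the intended fill-then-overwrite behavior, not the llama_model_loader API:

#include <algorithm>
#include <array>
#include <cstdint>
#include <vector>

// Pre-zero the fixed-size array, then copy however many section values the
// metadata provides (at most 4), so missing trailing entries stay 0.
static void load_sections(const std::vector<uint32_t> & from_gguf,
                          std::array<uint32_t, 4> & sections) {
    std::fill(sections.begin(), sections.end(), 0);
    std::copy_n(from_gguf.begin(),
                std::min<size_t>(from_gguf.size(), sections.size()),
                sections.begin());
}

int main() {
    std::array<uint32_t, 4> sections{};
    load_sections({16, 24, 24}, sections); // a Qwen2-VL style entry with three values
    // sections is now {16, 24, 24, 0}
    return 0;
}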
@@ -12398,7 +12406,7 @@ struct llm_build_context {

         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-        int sections[4] = {16, 24, 24, 0}; // TODO: move this into gguf model file.
+        int * sections = (int *)hparams.rope_mrope_sections.data();

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
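The sections pointer above replaces the hard-coded {16, 24, 24, 0} and is what the Qwen2-VL graph hands to the multi-section (M-RoPE) rope op. A rough standalone illustration of how such a sections array is interpreted, not the ggml kernel itself: the entries give the number of rotary dimension pairs assigned to the temporal, height, width, and spare position channels in that order (an assumption based on Qwen2-VL's M-RoPE; the exact mapping lives in ggml):

#include <array>
#include <cstdio>
#include <initializer_list>

// Return which position channel (0 = temporal, 1 = height, 2 = width, 3 = spare)
// a given rotary dimension pair falls into, walking the sections cumulatively.
static int channel_for_pair(const std::array<int, 4> & sections, int pair_idx) {
    int acc = 0;
    for (int c = 0; c < 4; ++c) {
        acc += sections[c];
        if (pair_idx < acc) {
            return c;
        }
    }
    return 0; // past the covered range: fall back to the temporal channel
}

int main() {
    const std::array<int, 4> sections = {16, 24, 24, 0}; // the old hard-coded default
    // With a 128-dim head there are 64 rotary pairs:
    // pairs 0..15 -> temporal, 16..39 -> height, 40..63 -> width.
    for (int pair : {0, 15, 16, 39, 40, 63}) {
        std::printf("pair %2d -> channel %d\n", pair, channel_for_pair(sections, pair));
    }
    return 0;
}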