Skip to content

Commit 2a03ea7

Browse files
committed
add GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_VISION
1 parent 5920fe5 commit 2a03ea7

File tree

6 files changed

+39
-18
lines changed

6 files changed

+39
-18
lines changed

examples/llava/clip.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
754754
if (ctx->has_qwen2vl_merger) {
755755
Q = ggml_mrope_ext(
756756
ctx0, Q, positions, nullptr,
757-
d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX*/, 32768, 10000, 1, 0, 1, 32, 1);
757+
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
758758
}
759759
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
760760
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
@@ -767,7 +767,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
767767
if (ctx->has_qwen2vl_merger) {
768768
K = ggml_mrope_ext(
769769
ctx0, K, positions, nullptr,
770-
d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX*/, 32768, 10000, 1, 0, 1, 32, 1);
770+
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
771771
}
772772
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
773773
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
@@ -1294,8 +1294,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
12941294
idx = get_key_idx(ctx, KEY_USE_GELU);
12951295
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
12961296

1297-
idx = get_key_idx(ctx, KEY_USE_SILU);
1298-
new_clip->use_silu = gguf_get_val_bool(ctx, idx);
1297+
try {
1298+
idx = get_key_idx(ctx, KEY_USE_SILU);
1299+
new_clip->use_silu = gguf_get_val_bool(ctx, idx);
1300+
} catch (std::runtime_error & /*e*/) {
1301+
new_clip->use_silu = false;
1302+
}
12991303

13001304
if (verbosity >= 1) {
13011305
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);

ggml/include/ggml.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,9 @@
237237
#define GGML_EXIT_SUCCESS 0
238238
#define GGML_EXIT_ABORTED 1
239239

240-
#define GGML_ROPE_TYPE_NEOX 2
240+
#define GGML_ROPE_TYPE_NEOX 2
241+
#define GGML_ROPE_TYPE_MROPE 4
242+
#define GGML_ROPE_TYPE_VISION 12
241243

242244
#define GGUF_MAGIC "GGUF"
243245

ggml/src/ggml-cuda/rope.cu

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -414,11 +414,15 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
414414
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
415415
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
416416
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
417-
memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
417+
memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
418418

419-
const bool is_mrope = sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0;
420-
const bool is_vision = is_mrope && sections.v[3] > 0;
421-
const bool is_neox = (mode & GGML_ROPE_TYPE_NEOX) & !(is_mrope || is_vision); // TODO: fix this with new rope type
419+
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
420+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
421+
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
422+
423+
if (is_mrope) {
424+
GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
425+
}
422426

423427
if (is_vision) {
424428
GGML_ASSERT(n_dims == ne00/2);

ggml/src/ggml.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11376,8 +11376,12 @@ static void ggml_compute_forward_rope_f32(
1137611376
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
1137711377

1137811378
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
11379-
const bool is_mrope = sections[0] > 0 || sections[1] > 0 || sections[2] > 0;
11380-
const bool is_vision = is_mrope && sections[3] > 0;
11379+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
11380+
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
11381+
11382+
if (is_mrope) {
11383+
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
11384+
}
1138111385

1138211386
if (is_vision) {
1138311387
GGML_ASSERT(n_dims == ne0/2);
@@ -11556,8 +11560,12 @@ static void ggml_compute_forward_rope_f16(
1155611560
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
1155711561

1155811562
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
11559-
const bool is_mrope = sections[0] > 0 || sections[1] > 0 || sections[2] > 0;
11560-
const bool is_vision = is_mrope && sections[3] > 0;
11563+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
11564+
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
11565+
11566+
if (is_mrope) {
11567+
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
11568+
}
1156111569

1156211570
if (is_vision) {
1156311571
GGML_ASSERT(n_dims == ne0/2);
@@ -11579,7 +11587,6 @@ static void ggml_compute_forward_rope_f16(
1157911587

1158011588
for (int64_t i3 = 0; i3 < ne3; i3++) {
1158111589
for (int64_t i2 = 0; i2 < ne2; i2++) {
11582-
const int64_t p = pos[i2];
1158311590

1158411591
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
1158511592
if (!is_mrope) {

include/llama.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,9 +107,11 @@ extern "C" {
107107
};
108108

109109
enum llama_rope_type {
110-
LLAMA_ROPE_TYPE_NONE = -1,
111-
LLAMA_ROPE_TYPE_NORM = 0,
112-
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
110+
LLAMA_ROPE_TYPE_NONE = -1,
111+
LLAMA_ROPE_TYPE_NORM = 0,
112+
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
113+
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
114+
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
113115
};
114116

115117
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file

src/llama.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19930,7 +19930,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
1993019930
case LLM_ARCH_BITNET:
1993119931
case LLM_ARCH_QWEN:
1993219932
case LLM_ARCH_QWEN2:
19933-
case LLM_ARCH_QWEN2VL:
1993419933
case LLM_ARCH_QWEN2MOE:
1993519934
case LLM_ARCH_OLMOE:
1993619935
case LLM_ARCH_PHI2:
@@ -19945,6 +19944,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
1994519944
case LLM_ARCH_EXAONE:
1994619945
case LLM_ARCH_MINICPM3:
1994719946
return LLAMA_ROPE_TYPE_NEOX;
19947+
19948+
case LLM_ARCH_QWEN2VL:
19949+
return LLAMA_ROPE_TYPE_MROPE;
1994819950

1994919951
// all model arches should be listed explicitly here
1995019952
case LLM_ARCH_UNKNOWN:

0 commit comments

Comments (0)