ggml: cache sin/cos for RoPE

JohannesGaessler · JohannesGaessler · commit dcc5520dd69f · 2024-01-13T11:31:36.000+01:00
diff --git a/ggml.c b/ggml.c
@@ -11720,6 +11720,24 @@ static void ggml_compute_forward_rope_f32(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float *            cache = ((float *) (params->wdata)) + ith*ne0;
+            float   theta_base_cache = (float) p;
+            if (!is_glm && !is_neox) {
+                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+                    float cos_theta, sin_theta;
+                    rope_yarn(
+                        theta_base_cache, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
+                    );
+                    sin_theta *= sin_sign;
+
+                    cache[i0 + 0] = cos_theta;
+                    cache[i0 + 1] = sin_theta;
+
+                    theta_base_cache *= theta_scale;
+                }
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
@@ -11753,11 +11771,8 @@ static void ggml_compute_forward_rope_f32(
                     }
                 } else if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
-                        sin_theta *= sin_sign;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
 
                         // zeta scaling for xPos only:
                         float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -16722,6 +16737,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
+            case GGML_OP_ROPE:
                 {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;