rwkv6: update cuda file name, rename op to RWKV_WKV6, split CPU wkv6 heads across threads

zhiyuan1i · zhiyuan1i · commit 2fd1ce16a6d2 · 2024-11-02T23:45:32.000+11:00
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
@@ -36,7 +36,7 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
-#include "ggml-cuda/rwkv-wkv.cuh"
+#include "ggml-cuda/wkv6.cuh"
 
 #include <algorithm>
 #include <array>
diff --git a/ggml/src/ggml-cuda/wkv6.cu b/ggml/src/ggml-cuda/wkv6.cu
@@ -1,5 +1,5 @@
 #include "common.cuh"
-#include "rwkv-wkv.cuh"
+#include "wkv6.cuh"
 
 static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
     const int tid = threadIdx.x;
diff --git a/ggml/src/ggml-cuda/wkv6.cuh b/ggml/src/ggml-cuda/wkv6.cuh
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -3077,7 +3077,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "WIN_UNPART",
     "GET_REL_POS",
     "ADD_REL_POS",
-    "RWKV_WKV",
+    "RWKV_WKV6",
 
     "UNARY",
 
@@ -16709,11 +16709,13 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
     float * dst_data = (float *) dst->data;
     float * state = ((float *) dst->data) + C * T;
 
-    if (params->ith != 0) {
+    if ((size_t)params->ith >= H) {
         return;
     }
 
-    memset(dst_data, 0, T * C * sizeof(float));
+    size_t h_start = (H * params->ith) / params->nth;
+    size_t h_end = ((H * (size_t)(params->ith + 1)) / (size_t)params->nth < H) ? 
+                (H * (size_t)(params->ith + 1)) / (size_t)params->nth : H;
 
     float * k =          (float *) dst->src[0]->data;
     float * v =          (float *) dst->src[1]->data;
@@ -16726,6 +16728,13 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
     size_t h_stride = C / H;
     size_t h_stride_2d = head_size * head_size;
 
+    if (params->ith == 0) {
+        memset(dst_data, 0, T * C * sizeof(float));
+    }
+    ggml_barrier(params->threadpool);
+
+    
+
     #ifdef __AVX2__
     // AVX2 uses 256-bit vectors = 8 float32
     const int vec_size = 8;
@@ -16737,7 +16746,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
         float * state_cur = state + state_offset;
         float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
 
-        for (size_t h = 0; h < H; h++) {
+        for (size_t h = h_start; h < h_end; h++) {
             size_t h_offset = h * h_stride;
             size_t t_h_offset = t_offset + h_offset;
             size_t h_2d_offset = h * h_stride_2d;
@@ -16815,7 +16824,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
         float * state_cur = state + state_offset;
         float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
 
-        for (size_t h = 0; h < H; h++) {
+        for (size_t h = h_start; h < h_end; h++) {
             size_t h_offset = h * h_stride;
             size_t t_h_offset = t_offset + h_offset;
             size_t h_2d_offset = h * h_stride_2d;
@@ -16897,7 +16906,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
         float * state_cur = state + state_offset;
         float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
 
-        for (size_t h = 0; h < H; h++) {
+        for (size_t h = h_start; h < h_end; h++) {
             size_t h_offset = h * h_stride;
             size_t t_h_offset = t_offset + h_offset;
             size_t h_2d_offset = h * h_stride_2d;
@@ -16958,7 +16967,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
         float * state_cur = state + state_offset;
         float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
 
-        for (size_t h = 0; h < H; h++) {
+        for (size_t h = h_start; h < h_end; h++) {
             size_t h_offset = h * h_stride;
             size_t t_h_offset = t_offset + h_offset;
             size_t h_2d_offset = h * h_stride_2d;
@@ -17050,7 +17059,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
         float * state_cur = state + state_offset;
         float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
 
-        for (size_t h = 0; h < H; h++) {
+        for (size_t h = h_start; h < h_end; h++) {
             size_t h_offset = h * h_stride;
             size_t t_h_offset = t_offset + h_offset;
             size_t h_2d_offset = h * h_stride_2d;