fix nwarps > batch size

JohannesGaessler · JohannesGaessler · commit f4003cfba1d9 · 2024-05-27T19:42:16.000+02:00
diff --git a/ggml-cuda/fattn-vec-f16.cu b/ggml-cuda/fattn-vec-f16.cu
@@ -92,6 +92,10 @@ static __global__ void flash_attn_vec_ext_f16(
         for (int j0 = 0; j0 < ncols; j0 += nwarps) {
             const int j = j0 + threadIdx.y;
 
+            if (j0 + nwarps > ncols && j >= ncols) {
+                break;
+            }
+
             // Reuse KQ as temporary storage for converting Q to q8_1:
             int   * tmp_q_i32 = (int   *) &KQ[j*D];
             half2 * tmp_q_ds  = (half2 *) (tmp_q_i32 + D/sizeof(int));
diff --git a/ggml-cuda/fattn-vec-f32.cu b/ggml-cuda/fattn-vec-f32.cu
@@ -92,6 +92,10 @@ static __global__ void flash_attn_vec_ext_f32(
         for (int j0 = 0; j0 < ncols; j0 += nwarps) {
             const int j = j0 + threadIdx.y;
 
+            if (j0 + nwarps > ncols && j >= ncols) {
+                break;
+            }
+
             // Reuse KQ as temporary storage for converting Q to q8_1:
             int    * tmp_q_i32 = (int    *) &KQ[j*D];
             float2 * tmp_q_ds  = (float2 *) (tmp_q_i32 + D/sizeof(int));