Fix some more int overflow in softmax.

dranger003 · dranger003 · commit 4947778d3bf5 · 2024-04-28T17:07:41.000-04:00
diff --git a/ggml-cuda/softmax.cu b/ggml-cuda/softmax.cu
@@ -28,7 +28,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
     extern __shared__ float data_soft_max_f32[];
     float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
     // shared memory buffer to cache values between iterations:
-    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
+    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
 
     float max_val = -INFINITY;
 
@@ -40,8 +40,8 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
             break;
         }
 
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
+        const int64_t ix = (int64_t)rowx*ncols + col;
+        const int64_t iy = (int64_t)rowy*ncols + col;
 
         const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
 
@@ -109,7 +109,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
             return;
         }
 
-        const int idst = rowx*ncols + col;
+        const int64_t idst = (int64_t)rowx*ncols + col;
         dst[idst] = vals[col] * inv_sum;
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f`
`28`	`28`	`extern __shared__ float data_soft_max_f32[];`
`29`	`29`	`float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication`
`30`	`30`	`// shared memory buffer to cache values between iterations:`
`31`		`- float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;`
	`31`	`+ float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;`
`32`	`32`
`33`	`33`	`float max_val = -INFINITY;`
`34`	`34`
`@@ -40,8 +40,8 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f`
`40`	`40`	`break;`
`41`	`41`	`}`
`42`	`42`
`43`		`- const int ix = rowx*ncols + col;`
`44`		`- const int iy = rowy*ncols + col;`
	`43`	`+ const int64_t ix = (int64_t)rowx*ncols + col;`
	`44`	`+ const int64_t iy = (int64_t)rowy*ncols + col;`
`45`	`45`
`46`	`46`	`const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);`
`47`	`47`
`@@ -109,7 +109,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f`
`109`	`109`	`return;`
`110`	`110`	`}`
`111`	`111`
`112`		`- const int idst = rowx*ncols + col;`
	`112`	`+ const int64_t idst = (int64_t)rowx*ncols + col;`
`113`	`113`	`dst[idst] = vals[col] * inv_sum;`
`114`	`114`	`}`
`115`	`115`	`}`