Revert local index variables derived from blockIdx, blockDim, and threadIdx from int64_t back to int.

dranger003 · dranger003 · commit 9acb43d7fa0b · 2024-04-10T07:48:22.000-04:00
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -1706,8 +1706,8 @@ static __global__ void k_compute_batched_ptrs(
         size_t  nb12, size_t  nb13,
         size_t  nbd2, size_t  nbd3,
         int64_t r2,   int64_t r3) {
-    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
diff --git a/ggml-cuda/convert.cu b/ggml-cuda/convert.cu
@@ -5,7 +5,7 @@
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
+    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
 
     if (i >= k) {
         return;
@@ -71,9 +71,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
     const int64_t i = blockIdx.x;
 
     // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t il  = tid/8;
-    const int64_t ir  = tid%8;
+    const int tid = threadIdx.x;
+    const int il  = tid/8;
+    const int ir  = tid%8;
     const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
@@ -99,9 +99,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
     const int64_t i = blockIdx.x;
 
     // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t il  = tid/8;
-    const int64_t ir  = tid%8;
+    const int tid = threadIdx.x;
+    const int il  = tid/8;
+    const int ir  = tid%8;
     const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
@@ -128,10 +128,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
     const int64_t i   = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t n   = tid/32;
-    const int64_t l   = tid - 32*n;
+    const int n   = tid/32;
+    const int l   = tid - 32*n;
     const int64_t is  = 8*n + l/16;
 
     const uint8_t q = x[i].qs[32*n + l];
@@ -159,16 +159,16 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
-    const int64_t i = blockIdx.x;
+    const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
 
 #if QK_K == 256
-    const int64_t r = threadIdx.x/4;
-    const int64_t tid = r/2;
-    const int64_t is0 = r%2;
-    const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
-    const int64_t n = tid / 4;
-    const int64_t j = tid - 4*n;
+    const int r = threadIdx.x/4;
+    const int tid = r/2;
+    const int is0 = r%2;
+    const int l0 = 16*is0 + 4*(threadIdx.x%4);
+    const int n = tid / 4;
+    const int j = tid - 4*n;
 
     uint8_t m = 1 << (4*n + j);
     int64_t is = 8*n + 2*j + is0;
@@ -187,11 +187,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
 
     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
 #else
-    const int64_t tid = threadIdx.x;
-    const int64_t is  = tid/16;  // 0 or 1
-    const int64_t il  = tid%16;  // 0...15
-    const int64_t im  = il/8;    // 0...1
-    const int64_t in  = il%8;    // 0...7
+    const int tid = threadIdx.x;
+    const int is  = tid/16;  // 0 or 1
+    const int il  = tid%16;  // 0...15
+    const int im  = il/8;    // 0...1
+    const int in  = il%8;    // 0...7
 
     dst_t * y = yy + i*QK_K + 16*is + il;
 
@@ -229,11 +229,11 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
 
 #if QK_K == 256
     // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t il  = tid/8;
-    const int64_t ir  = tid%8;
-    const int64_t is  = 2*il;
-    const int64_t n   = 4;
+    const int tid = threadIdx.x;
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int is  = 2*il;
+    const int n   = 4;
 
     dst_t * y = yy + i*QK_K + 64*il + n*ir;
 
@@ -252,7 +252,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
 #else
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     dst_t * y = yy + i*QK_K;
     const float d = (float)x[i].dm[0];
@@ -270,10 +270,10 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
 
 #if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
-    const int64_t tid = threadIdx.x;
-    const int64_t il  = tid/16;   // il is in 0...3
-    const int64_t ir  = tid%16;   // ir is in 0...15
-    const int64_t is  = 2*il;     // is is in 0...6
+    const int tid = threadIdx.x;
+    const int il  = tid/16;   // il is in 0...3
+    const int ir  = tid%16;   // ir is in 0...15
+    const int is  = 2*il;     // is is in 0...6
 
     dst_t * y = yy + i*QK_K + 64*il + 2*ir;
 
@@ -296,11 +296,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
 #else
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
     const uint8_t q = x[i].qs[tid];
-    const int64_t im = tid/8;  // 0...3
-    const int64_t in = tid%8;  // 0...7
-    const int64_t is = tid/16; // 0 or 1
+    const int im = tid/8;  // 0...3
+    const int in = tid%8;  // 0...7
+    const int is = tid/16; // 0 or 1
     const uint8_t h = x[i].qh[in] >> im;
     const float d = x[i].d;
     dst_t * y = yy + i*QK_K + tid;
@@ -317,10 +317,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 #if QK_K == 256
 
     // assume 64 threads - this is very slightly better than the one below
-    const int64_t tid = threadIdx.x;
-    const int64_t ip  = tid/32;   // ip is 0 or 1
-    const int64_t il  = tid - 32*ip; // 0...32
-    const int64_t is  = 8*ip + il/16;
+    const int tid = threadIdx.x;
+    const int ip  = tid/32;   // ip is 0 or 1
+    const int il  = tid - 32*ip; // 0...32
+    const int is  = 8*ip + il/16;
 
     dst_t * y = yy + i*QK_K + 128*ip + il;
 
@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 #else
 
     // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t ip  = tid/16;         // 0 or 1
-    const int64_t il  = tid - 16*ip;    // 0...15
+    const int tid = threadIdx.x;
+    const int ip  = tid/16;         // 0 or 1
+    const int il  = tid - 16*ip;    // 0...15
 
     dst_t * y = yy + i*QK_K + 16*ip + il;
 
@@ -360,10 +360,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
     const int64_t i   = blockIdx.x;
     const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint16_t * q2 = x[i].qs + 4*ib;
     const uint8_t  * aux8 = (const uint8_t *)q2;
@@ -384,10 +384,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
     const int64_t i   = blockIdx.x;
     const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint16_t * q2 = x[i].qs + 4*ib;
     const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
@@ -406,10 +406,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
     const int64_t i   = blockIdx.x;
     const block_iq2_s * x = (const block_iq2_s *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
@@ -427,10 +427,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
     const int64_t i   = blockIdx.x;
     const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint8_t  * q3 = x[i].qs + 8*ib;
     const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
@@ -455,10 +455,10 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
     const int64_t i   = blockIdx.x;
     const block_iq3_s * x = (const block_iq3_s *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint8_t * qs = x[i].qs + 8*ib;
     const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
@@ -481,10 +481,10 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
     const int64_t i   = blockIdx.x;
     const block_iq1_s * x = (const block_iq1_s  *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
     const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
@@ -507,10 +507,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
     const int64_t i   = blockIdx.x;
     const block_iq1_m * x = (const block_iq1_m  *) vx;
 
-    const int64_t tid = threadIdx.x;
+    const int tid = threadIdx.x;
 #if QK_K == 256
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint16_t * sc = (const uint16_t *)x[i].scales;
     iq1m_scale_t scale;
@@ -538,9 +538,9 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
     const int64_t i   = blockIdx.x;
     const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
 
-    const int64_t tid = threadIdx.x;
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 4*il;
     const uint8_t  * q4 = x[ib].qs + 4*il;
     const float d = (float)x[ib].d;
@@ -557,9 +557,9 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
     const int64_t i   = blockIdx.x;
     const block_iq4_xs * x = (const block_iq4_xs *)vx;
 
-    const int64_t tid = threadIdx.x;
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 4*il;
     const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
     const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
@@ -707,7 +707,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;