Skip to content

Commit 2b5eb72

Browse files
Fixed __dp4a compute capability: 6.0 -> 6.1 (#2189)
1 parent f7d278f commit 2b5eb72

File tree

1 file changed

+11
-11
lines changed

1 file changed

+11
-11
lines changed

ggml-cuda.cu

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1258,7 +1258,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
12581258
}
12591259

12601260
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1261-
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1261+
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
12621262
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
12631263

12641264
int vi;
@@ -1279,11 +1279,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
12791279
return sumi*d;
12801280
#else
12811281
return 0.0f; // only to satisfy the compiler
1282-
#endif // __CUDA_ARCH__ >= 600
1282+
#endif // __CUDA_ARCH__ >= 610
12831283
}
12841284

12851285
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1286-
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1286+
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
12871287
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
12881288

12891289
const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1304,11 +1304,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
13041304
return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
13051305
#else
13061306
return 0.0f; // only to satisfy the compiler
1307-
#endif // __CUDA_ARCH__ >= 600
1307+
#endif // __CUDA_ARCH__ >= 610
13081308
}
13091309

13101310
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1311-
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1311+
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
13121312
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
13131313

13141314
int qs;
@@ -1339,11 +1339,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
13391339
return sumi*d;
13401340
#else
13411341
return 0.0f; // only to satisfy the compiler
1342-
#endif // __CUDA_ARCH__ >= 600
1342+
#endif // __CUDA_ARCH__ >= 610
13431343
}
13441344

13451345
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1346-
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1346+
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
13471347
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
13481348

13491349
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1373,11 +1373,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
13731373
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
13741374
#else
13751375
return 0.0f; // only to satisfy the compiler
1376-
#endif // __CUDA_ARCH__ >= 600
1376+
#endif // __CUDA_ARCH__ >= 610
13771377
}
13781378

13791379
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1380-
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1380+
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
13811381
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
13821382

13831383
int vi;
@@ -1392,7 +1392,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
13921392
return sumi*d;
13931393
#else
13941394
return 0.0f; // only to satisfy the compiler
1395-
#endif // __CUDA_ARCH__ >= 600
1395+
#endif // __CUDA_ARCH__ >= 610
13961396
}
13971397

13981398
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -2417,7 +2417,7 @@ inline void ggml_cuda_op_mul_mat_vec(
24172417
src0->type == GGML_TYPE_Q5_1 ||
24182418
src0->type == GGML_TYPE_Q8_0;
24192419

2420-
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
2420+
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
24212421
#endif
24222422

24232423
if (use_mul_mat_vec_q) {

0 commit comments

Comments
 (0)