@@ -512,6 +512,14 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -1972,8 +1980,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2010,8 +2017,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2046,8 +2052,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2092,8 +2097,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2114,8 +2118,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2145,8 +2148,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2181,8 +2183,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2219,8 +2220,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2260,8 +2260,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2286,8 +2285,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2320,8 +2318,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2354,8 +2351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2395,8 +2391,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2429,8 +2424,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2460,8 +2454,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2492,8 +2485,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3359,8 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3543,8 +3534,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3954,7 +3944,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4023,7 +4013,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4090,7 +4080,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4157,7 +4147,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4224,7 +4214,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4291,7 +4281,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4360,7 +4350,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4429,7 +4419,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4496,7 +4486,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4565,7 +4555,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
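For context, here is a minimal standalone sketch of the pattern this diff applies (the names bad_arch_sketch, dot4_sketch, and demo_kernel below are illustrative, not part of ggml-cuda). Device-side assert compiles to a no-op when NDEBUG is defined, so the old fallback could silently return 0.0f on an unsupported architecture. A [[noreturn]] helper that prints a message and calls __trap() aborts the kernel with a clear error instead, and because the compiler knows the call never returns, the fallback branch no longer needs a dummy return value to satisfy it.

// sketch.cu -- minimal illustration of the [[noreturn]] trap fallback
// (hypothetical names; not the actual ggml-cuda sources)
#include <cstdio>

#define MIN_CC_DP4A_SKETCH 610 // __dp4a is available from compute capability 6.1

[[noreturn]]
static __device__ void bad_arch_sketch() {
    printf("ERROR: kernel was compiled without support for the current GPU architecture.\n");
    __trap();

    (void) bad_arch_sketch; // suppress unused-function warning
}

// Arch-guarded helper: 4x 8-bit integer dot product via __dp4a where available.
static __device__ __forceinline__ int dot4_sketch(const int a, const int b) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A_SKETCH
    return __dp4a(a, b, 0);
#else
    bad_arch_sketch(); // [[noreturn]], so no dummy "return 0;" is needed here
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A_SKETCH
}

__global__ void demo_kernel(const int * a, const int * b, int * dst) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    dst[i] = dot4_sketch(a[i], b[i]);
}

Built for an older target (for example nvcc -arch=sm_52 sketch.cu), a launch of demo_kernel hits the trap at run time and subsequent CUDA API calls report an error, rather than the dot product silently evaluating to zero the way the assert(false); return 0.0f; fallback did in builds where NDEBUG disables the assertion.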