Commit 1398823

cuda : replace asserts in wrong architecture checks with __trap (#4556)
* cuda : replace asserts in wrong architecture checks with __trap
* make bad_arch noreturn, remove returns
1 parent d3223af commit 1398823
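
The change is mechanical at every site: a __CUDA_ARCH__ guard whose unsupported branch used to hit a host-style assert(false) (a no-op once NDEBUG is defined) and then fall through to a dummy "return 0.0f;" now calls a single bad_arch() helper, which prints a diagnostic and aborts the kernel with __trap(). Below is a minimal self-contained sketch of the idiom; guarded_dot is an illustrative stand-in rather than a ggml function, and MIN_CC_DP4A is assumed to match the repository's threshold for dp4a-capable hardware.

#include <cstdio>

#define MIN_CC_DP4A 610 // assumption: first compute capability with dp4a (6.1)

[[noreturn]]
static __device__ void bad_arch() {
    printf("ERROR: compiled without support for the current GPU architecture.\n");
    __trap(); // abort the kernel instead of returning a fabricated value

    (void) bad_arch; // suppress the unused-function warning
}

// Illustrative caller: the #else branch needs no placeholder return,
// because bad_arch() is declared [[noreturn]].
static __device__ float guarded_dot(int a, int b) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return (float) __dp4a(a, b, 0); // dp4a fast path on cc >= 6.1
#else
    bad_arch();
#endif
}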

File tree

1 file changed: +36 -46 lines changed


ggml-cuda.cu

Lines changed: 36 additions & 46 deletions
@@ -512,6 +512,14 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {

@@ -1972,8 +1980,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2010,8 +2017,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2046,8 +2052,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2092,8 +2097,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2114,8 +2118,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2145,8 +2148,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2181,8 +2183,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2219,8 +2220,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2260,8 +2260,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2286,8 +2285,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2320,8 +2318,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2354,8 +2351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2395,8 +2391,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2429,8 +2424,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2460,8 +2454,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2492,8 +2485,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3359,8 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3543,8 +3534,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3954,7 +3944,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4023,7 +4013,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4090,7 +4080,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4157,7 +4147,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4224,7 +4214,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4291,7 +4281,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4360,7 +4350,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4429,7 +4419,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4496,7 +4486,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4565,7 +4555,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
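
Two details make the simplification work. Marking bad_arch() as [[noreturn]] tells the compiler that the #else branch never yields a value, so the float-returning vec_dot_* helpers no longer need their "return 0.0f; // only to satisfy the compiler" placeholders. The self-reference "(void) bad_arch;" silences the unused-function warning when every call site is compiled out for the target architecture. At run time, a trapped kernel surfaces on the host as a failed launch rather than completing with silently wrong results. The sketch below shows how a caller might observe this; demo_kernel and the error handling are illustrative, not part of ggml, and the exact error code reported after a trap can vary by driver (commonly cudaErrorLaunchFailed).

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative kernel: on an unsupported architecture it traps, standing in
// for the ggml kernels that now call bad_arch().
__global__ void demo_kernel(float * out) {
#if __CUDA_ARCH__ >= 610
    *out = 1.0f;
#else
    __trap();
#endif
}

int main() {
    float * d_out = nullptr;
    cudaMalloc((void **) &d_out, sizeof(float));
    demo_kernel<<<1, 1>>>(d_out);

    // The trap is reported at the next synchronization point instead of
    // letting the computation continue with garbage data.
    const cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    cudaFree(d_out);
    return 0;
}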
