#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

- #define CC_PASCAL 600
- #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
- #define CC_VOLTA 700
- #define CC_TURING 750
- #define CC_AMPERE 800
- #define CC_OFFSET_AMD 1000000
+ #define GGML_CUDA_CC_PASCAL 600
+ #define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #define GGML_CUDA_CC_VOLTA 700
+ #define GGML_CUDA_CC_TURING 750
+ #define GGML_CUDA_CC_AMPERE 800
+ #define GGML_CUDA_CC_OFFSET_AMD 1000000

// GCN/CDNA, wave size is 64
- #define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
- #define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
- #define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
- #define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
- #define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
- #define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
+ #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
+ #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
+ #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
+ #define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
+ #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
+ #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
- #define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
- #define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
- #define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+ #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+ #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+ #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA

- #define CC_QY1 210
- #define CC_QY2 220
+ #define GGML_CUDA_CC_QY1 210
+ #define GGML_CUDA_CC_QY2 220

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
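The offset macro is what lets a single integer describe both vendors: NVIDIA devices use 100*major + 10*minor (so 7.0 becomes 700), while AMD devices are stored as the offset plus the GFX version, keeping the two ranges disjoint. A minimal sketch of that encoding, assuming the 100*major + 10*minor convention; the device_cc_* helpers are illustrative, not part of this header:

// Sketch (not part of the diff): how the compute-capability encoding above
// can be derived and checked. The device_cc_* helpers are hypothetical.
static constexpr int device_cc_from_nvidia(int major, int minor) {
    return 100*major + 10*minor;                  // e.g. 8.6 -> 860
}
static constexpr int device_cc_from_amd(int gfx_version) {
    return GGML_CUDA_CC_OFFSET_AMD + gfx_version; // e.g. gfx1030 -> 1001030
}

static_assert(device_cc_from_nvidia(7, 0) == GGML_CUDA_CC_VOLTA, "7.0 is Volta");
static_assert(device_cc_from_amd(1030)    == GGML_CUDA_CC_RDNA2, "gfx1030 is RDNA2");
static_assert(device_cc_from_nvidia(8, 6) <  GGML_CUDA_CC_OFFSET_AMD, "NVIDIA ccs sort below the AMD range");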
@@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16

- #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+ #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
- #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+ #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+ #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#define FP16_MMA_AVAILABLE
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+ #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+ #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define INT8_MMA_AVAILABLE
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+ #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

- #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+ #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
#define FLASH_ATTN_AVAILABLE
- #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+ #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

static constexpr bool fast_fp16_available(const int cc) {
-     return cc >= CC_PASCAL && cc != 610;
+     return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
}

static constexpr bool fp16_mma_available(const int cc) {
-     return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
}

static constexpr bool int8_mma_available(const int cc) {
-     return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
}
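The feature macros above gate device code at compile time (per __CUDA_ARCH__), while these constexpr helpers take the compute capability detected at runtime, so host code can pick a code path per device. A hedged usage sketch; choose_mul_mat_path and its enum are hypothetical, not ggml API:

// Sketch (not part of the diff): hypothetical host-side dispatch on the
// runtime-detected compute capability, using the helpers defined above.
enum class mul_mat_path { int8_mma, fp16_mma, dp4a, scalar };

static mul_mat_path choose_mul_mat_path(const int cc) {
    if (int8_mma_available(cc))  return mul_mat_path::int8_mma; // Turing+ NVIDIA
    if (fp16_mma_available(cc))  return mul_mat_path::fp16_mma; // Volta+ NVIDIA
    if (cc >= GGML_CUDA_CC_DP4A) return mul_mat_path::dp4a;     // hardware byte-wise dot product
    return mul_mat_path::scalar;                                // emulated in ggml_cuda_dp4a
}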
[[noreturn]]
@@ -187,15 +187,15 @@ static __device__ void no_device_code(
#endif // __CUDA_ARCH__

static __device__ __forceinline__ int warp_reduce_sum(int x) {
- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+ #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+ #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}
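The fallback loop is a butterfly reduction: each of the five steps XORs the lane index with offsets 16, 8, 4, 2, 1, so after log2(32) exchanges every lane of the warp holds the full sum; on Ampere and newer a single __reduce_add_sync does the same in one instruction. A standalone kernel sketch with the same shuffle pattern (kernel name and launch shape are illustrative):

// Sketch (not part of the diff): one-warp kernel exercising the same
// butterfly reduction; every lane ends up with the sum of all 32 inputs.
__global__ void warp_sum_demo(const int * in, int * out) {
    int x = in[threadIdx.x]; // threadIdx.x in [0, 32)
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    if (threadIdx.x == 0) {
        *out = x; // lane 0 writes the warp-wide result
    }
}
// Launched as e.g. warp_sum_demo<<<1, 32>>>(d_in, d_out);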

static __device__ __forceinline__ float warp_reduce_sum(float x) {
@@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
}

static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
}

#if CUDART_VERSION < CUDART_HMASK
@@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

- #if __CUDA_ARCH__ >= MIN_CC_DP4A
+ #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
    return __dp4a(a, b, c);
- #else // __CUDA_ARCH__ >= MIN_CC_DP4A
+ #else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A

#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}
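On CUDA the intrinsic path and the fallback compute the same thing: each 32-bit operand is reinterpreted as four signed bytes, the bytes are multiplied pairwise, and the four products are added to the accumulator. A small host-side reference of that arithmetic (function name is illustrative; the example constant assumes a little-endian host):

// Sketch (not part of the diff): host-side reference for the byte-wise
// dot product that __dp4a performs on the device.
static int dp4a_ref(const int a, const int b, const int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}
// Example: dp4a_ref(0x01020304, 0x01010101, 10) == 10 + 4+3+2+1 == 20
// (0x01020304 packs the bytes 4, 3, 2, 1 on a little-endian host).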