
Commit 750cb3e

CUDA: rename macros to avoid conflicts with WinAPI (#10736)
* Renames NVIDIA GPU-architecture flags to avoid name clashes with WinAPI. (e.g. CC_PASCAL, GPU architecture or WinAPI pascal compiler flag?)
* Reverts erroneous rename in SYCL-code.
* Renames GGML_CUDA_MIN_CC_DP4A to GGML_CUDA_CC_DP4A.
* Renames the rest of the compute capability macros for consistency.
1 parent a86ad84 commit 750cb3e
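
For context on the clash the rename avoids: the Windows SDK's wtypes.h declares a CALLCONV enum whose enumerators include CC_CDECL, CC_PASCAL and CC_STDCALL, so an unprefixed object-like macro named CC_PASCAL is substituted into those declarations as soon as the Windows headers are included. The stand-alone C++ sketch below reproduces the failure mode with a stand-in enum rather than the real header; the enum name and values here are illustrative only.

// Stand-in for the CALLCONV enum declared by the Windows COM headers.
// (Illustrative only -- the real declaration lives in wtypes.h.)
enum CALLCONV_stand_in {
    CC_CDECL   = 1,
    CC_PASCAL  = 2,   // the enumerator an unprefixed macro would collide with
    CC_STDCALL = 4,
};

// Old style: #define CC_PASCAL 600
// If that macro were defined before the header above, the preprocessor would
// rewrite the enumerator name into 600, producing "600 = 2" -- a compile error.

// New style: the GGML_CUDA_ prefix keeps the macro out of WinAPI's namespace.
#define GGML_CUDA_CC_PASCAL 600

static_assert(GGML_CUDA_CC_PASCAL == 600, "prefixed macro coexists with the enum");

int main() { return CC_PASCAL == 2 ? 0 : 1; }  // the WinAPI-style enumerator stays usable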

10 files changed: 69 additions, 71 deletions

ggml/src/ggml-common.h

Lines changed: 1 addition & 1 deletion

@@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()

-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,

ggml/src/ggml-cuda/common.cuh

Lines changed: 35 additions & 35 deletions

@@ -41,28 +41,28 @@
 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define CC_PASCAL     600
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_TURING     750
-#define CC_AMPERE     800
-#define CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_PASCAL     600
+#define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA      700
+#define GGML_CUDA_CC_TURING     750
+#define GGML_CUDA_CC_AMPERE     800
+#define GGML_CUDA_CC_OFFSET_AMD 1000000

 // GCN/CNDA, wave size is 64
-#define CC_GCN4       (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
-#define CC_VEGA       (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
-#define CC_VEGA20     (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
-#define CC_CDNA       (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
-#define CC_CDNA2      (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
-#define CC_CDNA3      (CC_OFFSET_AMD + 942)  // MI300
+#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
+#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 942)  // MI300

 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010) // RX 5000
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA

-#define CC_QY1        210
-#define CC_QY2        220
+#define GGML_CUDA_CC_QY1        210
+#define GGML_CUDA_CC_QY2        220

 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

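The renamed constants keep the existing single-integer device encoding: NVIDIA devices are 100*major + 10*minor (see ggml_cuda_init() further down in this commit), and AMD devices get GGML_CUDA_CC_OFFSET_AMD added on top, so a plain range check separates the two vendors. A minimal host-side C++ sketch of that property, reusing the predicate exactly as defined in this file; example values such as 860 for an sm_86 part are illustrative:

// Minimal sketch: the renamed macros still partition one integer axis into
// "NVIDIA compute capabilities" (< offset) and "AMD architectures" (>= offset).
#define GGML_CUDA_CC_VOLTA      700
#define GGML_CUDA_CC_OFFSET_AMD 1000000
#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 1100)

// Copied from the hunk above: fp16 tensor-core MMA is an NVIDIA-only, Volta+ feature.
static constexpr bool fp16_mma_available(const int cc) {
    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
}

static_assert( fp16_mma_available(860),                "sm_86: NVIDIA, Volta or newer");
static_assert(!fp16_mma_available(610),                "sm_61: NVIDIA, but pre-Volta");
static_assert(!fp16_mma_available(GGML_CUDA_CC_RDNA3), "AMD: cc lies above the offset");

int main() { return 0; }
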
@@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

 static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= CC_PASCAL && cc != 610;
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }

 static constexpr bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }

 static constexpr bool int8_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }

 [[noreturn]]
@@ -187,15 +187,15 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__

 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

 static __device__ __forceinline__ float warp_reduce_sum(float x) {
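
On architectures older than GGML_CUDA_CC_AMPERE the sum is built from log2(32) XOR-shuffle exchanges instead of __reduce_add_sync. A small host-side C++ simulation of that butterfly pattern; the array stands in for the 32 lanes of a warp and nothing here is CUDA-specific:

#include <array>
#include <cassert>

int main() {
    std::array<int, 32> lane{};
    for (int i = 0; i < 32; ++i) lane[i] = i + 1;                      // per-lane inputs 1..32

    // Mirrors the loop in warp_reduce_sum: offsets 16, 8, 4, 2, 1.
    for (int offset = 16; offset > 0; offset >>= 1) {
        std::array<int, 32> snapshot = lane;                           // __shfl_xor_sync analogue:
        for (int i = 0; i < 32; ++i) lane[i] += snapshot[i ^ offset];  // add the partner lane's value
    }

    for (int v : lane) assert(v == 32 * 33 / 2);                       // every lane ends with 528
    return 0;
}
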
@@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 }

 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }

 #if CUDART_VERSION < CUDART_HMASK
@@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i

 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

-#if __CUDA_ARCH__ >= MIN_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A

 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }

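The fallback branch in ggml_cuda_dp4a above is easy to check on the host. A self-contained C++ sketch of the same byte-wise dot product; dp4a_ref is an illustrative name, not a function from the backend:

#include <cstdint>
#include <cstdio>

// Same arithmetic as the pre-__dp4a fallback: treat each int as four signed
// bytes and accumulate their pairwise products into c.
static int dp4a_ref(int a, int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    const int a = 0x01020304;                 // bytes 0x04, 0x03, 0x02, 0x01 on little-endian targets
    const int b = 0x01010101;                 // bytes 1, 1, 1, 1
    std::printf("%d\n", dp4a_ref(a, b, 0));   // prints 10 = 4 + 3 + 2 + 1
    return 0;
}
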
ggml/src/ggml-cuda/convert.cu

Lines changed: 3 additions & 3 deletions

@@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __

 template <bool need_check>
 static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
-#if __CUDA_ARCH__ >= CC_PASCAL
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
     constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

     const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;

@@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     GGML_UNUSED(y);
     GGML_UNUSED(k);
     NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_PASCAL
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }

 template<typename dst_t>

@@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
+            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
                 return dequantize_block_q8_0_f16_cuda;
             }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;

ggml/src/ggml-cuda/fattn.cu

Lines changed: 1 addition & 1 deletion

@@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);

     // On AMD the tile kernels perform poorly, use the vec kernel instead:
-    if (cc >= CC_OFFSET_AMD) {
+    if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
         if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
         } else {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 6 additions & 6 deletions

@@ -177,7 +177,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
-        info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+        info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;

@@ -1081,7 +1081,7 @@ static void ggml_cuda_op_mul_mat_cublas(

     const int compute_capability = ggml_cuda_info().devices[id].cc;

-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {

@@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half beta_f16 = 0.0f;

         cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
-        if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+        if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
             cu_compute_type = CUBLAS_COMPUTE_32F;
         }

@@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t cu_data_type = CUDA_R_16F;

-    if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+    if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
     }

@@ -2357,7 +2357,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     std::vector<void *> ggml_cuda_cpy_fn_ptrs;

     if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);

@@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return true;
             }
             const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-            return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+            return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
         }
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:

ggml/src/ggml-cuda/mma.cuh

Lines changed: 4 additions & 4 deletions

@@ -171,7 +171,7 @@ struct mma_int_C_I16J8 {

     __device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
 #ifdef INT8_MMA_AVAILABLE
-#if __CUDA_ARCH__ >= CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
             : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));

@@ -183,7 +183,7 @@ struct mma_int_C_I16J8 {
         asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
             : "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
-#endif // __CUDA_ARCH__ >= CC_AMPERE
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #else
         GGML_UNUSED(mma_A);
         GGML_UNUSED(mma_B);

@@ -193,7 +193,7 @@ struct mma_int_C_I16J8 {

     __device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
 #ifdef INT8_MMA_AVAILABLE
-#if __CUDA_ARCH__ >= CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
             : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));

@@ -211,7 +211,7 @@ struct mma_int_C_I16J8 {
         asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
             : "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[3]), "r"(mma_B.x[1]));
-#endif // __CUDA_ARCH__ >= CC_AMPERE
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #else
         GGML_UNUSED(mma_A);
         GGML_UNUSED(mma_B);

ggml/src/ggml-cuda/mmq.cu

Lines changed: 5 additions & 5 deletions

@@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
+    const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};

     switch (src0->type) {

@@ -136,17 +136,17 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return true;
     }

-    if (cc < MIN_CC_DP4A) {
+    if (cc < GGML_CUDA_CC_DP4A) {
         return false;
     }

 #ifdef GGML_CUDA_FORCE_MMQ
     return true;
 #endif //GGML_CUDA_FORCE_MMQ

-    if (cc < CC_OFFSET_AMD) {
-        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    if (cc < GGML_CUDA_CC_OFFSET_AMD) {
+        return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }

-    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
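
The NVIDIA branch of ggml_cuda_should_use_mmq reads naturally with the prefixed macros: no MMQ below GGML_CUDA_CC_DP4A, always MMQ on dp4a-capable pre-Volta parts, and MMQ only for small batches once tensor cores exist. A simplified host-side C++ sketch of just that branch; MMQ_DP4A_MAX_BATCH_SIZE is defined elsewhere in the backend, so the value 32 below is only a stand-in, and the AMD branch is omitted:

#define GGML_CUDA_CC_DP4A       610
#define GGML_CUDA_CC_VOLTA      700
#define MMQ_DP4A_MAX_BATCH_SIZE 32        // stand-in value for illustration only

// Simplified: covers only the NVIDIA branch shown in the hunk above.
static constexpr bool should_use_mmq_nvidia(const int cc, const long ne11) {
    if (cc < GGML_CUDA_CC_DP4A) {
        return false;                     // no byte-wise dot product -> no MMQ
    }
    return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

static_assert( should_use_mmq_nvidia(610, 512), "dp4a-capable Pascal: MMQ regardless of batch size");
static_assert(!should_use_mmq_nvidia(860, 512), "tensor-core GPU, large batch: take the cuBLAS path instead");
static_assert( should_use_mmq_nvidia(860,   8), "tensor-core GPU, small batch: MMQ still wins");

int main() { return 0; }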
