Skip to content

Commit 4d0598e

Browse files
authored
HIP: add GGML_CUDA_CC_IS_* for amd familys as increasing cc archtectures for amd gpus are not supersets of eatch other (#11601)
This fixes a bug where RDNA1 gpus other than gfx1010 where not handled correctly
1 parent 90f9b88 commit 4d0598e

File tree

4 files changed

+11
-4
lines changed

4 files changed

+11
-4
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@
6161
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
6262
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
6363

64+
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
65+
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
66+
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
67+
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
68+
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
69+
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
70+
6471
#define GGML_CUDA_CC_QY1 210
6572
#define GGML_CUDA_CC_QY2 220
6673

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,7 +1205,7 @@ static void ggml_cuda_op_mul_mat_cublas(
12051205

12061206
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
12071207

1208-
if (compute_capability == GGML_CUDA_CC_CDNA) {
1208+
if (GGML_CUDA_CC_IS_CDNA(compute_capability)) {
12091209
const float alpha = 1.0f;
12101210
const float beta = 0.0f;
12111211
CUBLAS_CHECK(
@@ -1750,7 +1750,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
17501750
beta = &beta_f32;
17511751
}
17521752

1753-
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
1753+
if (GGML_CUDA_CC_IS_CDNA(ggml_cuda_info().devices[ctx.device].cc)) {
17541754
cu_compute_type = CUBLAS_COMPUTE_32F;
17551755
alpha = &alpha_f32;
17561756
beta = &beta_f32;

ggml/src/ggml-cuda/mmq.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
148148
return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
149149
}
150150

151-
return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
151+
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
152152
}

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
120120
}
121121

122122
static constexpr int get_mmq_y_host(const int cc) {
123-
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
123+
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
124124
}
125125

126126
static constexpr __device__ int get_mmq_y_device() {

0 commit comments

Comments
 (0)