Skip to content

Commit 59937e4

Browse files
committed
rename CC_TURING to CC_VOLTA
1 parent 62832c5 commit 59937e4

File tree

1 file changed

+45
-44
lines changed

1 file changed

+45
-44
lines changed

ggml-cuda.cu

Lines changed: 45 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -80,9 +80,9 @@
8080
#include "ggml.h"
8181

8282
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
83-
#define CC_TURING 700
83+
#define CC_VOLTA 700
8484
#define CC_OFFSET_AMD 1000000
85-
#define CC_RDNA2 CC_OFFSET_AMD + 1030
85+
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
8686

8787
#if defined(GGML_USE_HIPBLAS)
8888
#define __CUDA_ARCH__ 1300
@@ -3553,7 +3553,7 @@ template <bool need_check> static __global__ void
35533553
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
35543554
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
35553555

3556-
#elif __CUDA_ARCH__ >= CC_TURING
3556+
#elif __CUDA_ARCH__ >= CC_VOLTA
35573557
const int mmq_x = MMQ_X_Q4_0_AMPERE;
35583558
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
35593559
const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3573,7 +3573,7 @@ template <bool need_check> static __global__ void
35733573
#else
35743574
(void) vec_dot_q4_0_q8_1_mul_mat;
35753575
assert(false);
3576-
#endif // __CUDA_ARCH__ >= CC_TURING
3576+
#endif // __CUDA_ARCH__ >= CC_VOLTA
35773577
}
35783578

35793579
#define MMQ_X_Q4_1_RDNA2 64
@@ -3594,9 +3594,9 @@ template <bool need_check> static __global__ void
35943594
#if defined(RDNA3) || defined(RDNA2)
35953595
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
35963596
#endif // defined(RDNA3) || defined(RDNA2)
3597-
#elif __CUDA_ARCH__ < CC_TURING
3597+
#elif __CUDA_ARCH__ < CC_VOLTA
35983598
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3599-
#endif // __CUDA_ARCH__ < CC_TURING
3599+
#endif // __CUDA_ARCH__ < CC_VOLTA
36003600
mul_mat_q4_1(
36013601
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
36023602
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3616,7 +3616,7 @@ template <bool need_check> static __global__ void
36163616
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
36173617
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
36183618

3619-
#elif __CUDA_ARCH__ >= CC_TURING
3619+
#elif __CUDA_ARCH__ >= CC_VOLTA
36203620
const int mmq_x = MMQ_X_Q4_1_AMPERE;
36213621
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
36223622
const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3636,7 +3636,7 @@ template <bool need_check> static __global__ void
36363636
#else
36373637
(void) vec_dot_q4_1_q8_1_mul_mat;
36383638
assert(false);
3639-
#endif // __CUDA_ARCH__ >= CC_TURING
3639+
#endif // __CUDA_ARCH__ >= CC_VOLTA
36403640
}
36413641

36423642
#define MMQ_X_Q5_0_RDNA2 64
@@ -3677,7 +3677,7 @@ template <bool need_check> static __global__ void
36773677
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
36783678
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
36793679

3680-
#elif __CUDA_ARCH__ >= CC_TURING
3680+
#elif __CUDA_ARCH__ >= CC_VOLTA
36813681
const int mmq_x = MMQ_X_Q5_0_AMPERE;
36823682
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
36833683
const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3697,7 +3697,7 @@ template <bool need_check> static __global__ void
36973697
#else
36983698
(void) vec_dot_q5_0_q8_1_mul_mat;
36993699
assert(false);
3700-
#endif // __CUDA_ARCH__ >= CC_TURING
3700+
#endif // __CUDA_ARCH__ >= CC_VOLTA
37013701
}
37023702

37033703
#define MMQ_X_Q5_1_RDNA2 64
@@ -3738,7 +3738,7 @@ mul_mat_q5_1(
37383738
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
37393739
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
37403740

3741-
#elif __CUDA_ARCH__ >= CC_TURING
3741+
#elif __CUDA_ARCH__ >= CC_VOLTA
37423742
const int mmq_x = MMQ_X_Q5_1_AMPERE;
37433743
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
37443744
const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3758,7 +3758,7 @@ mul_mat_q5_1(
37583758
#else
37593759
(void) vec_dot_q5_1_q8_1_mul_mat;
37603760
assert(false);
3761-
#endif // __CUDA_ARCH__ >= CC_TURING
3761+
#endif // __CUDA_ARCH__ >= CC_VOLTA
37623762
}
37633763

37643764
#define MMQ_X_Q8_0_RDNA2 64
@@ -3799,7 +3799,7 @@ template <bool need_check> static __global__ void
37993799
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
38003800
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
38013801

3802-
#elif __CUDA_ARCH__ >= CC_TURING
3802+
#elif __CUDA_ARCH__ >= CC_VOLTA
38033803
const int mmq_x = MMQ_X_Q8_0_AMPERE;
38043804
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
38053805
const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3819,7 +3819,7 @@ template <bool need_check> static __global__ void
38193819
#else
38203820
(void) vec_dot_q8_0_q8_1_mul_mat;
38213821
assert(false);
3822-
#endif // __CUDA_ARCH__ >= CC_TURING
3822+
#endif // __CUDA_ARCH__ >= CC_VOLTA
38233823
}
38243824

38253825
#define MMQ_X_Q2_K_RDNA2 64
@@ -3860,7 +3860,7 @@ mul_mat_q2_K(
38603860
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
38613861
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
38623862

3863-
#elif __CUDA_ARCH__ >= CC_TURING
3863+
#elif __CUDA_ARCH__ >= CC_VOLTA
38643864
const int mmq_x = MMQ_X_Q2_K_AMPERE;
38653865
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
38663866
const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3880,7 +3880,7 @@ mul_mat_q2_K(
38803880
#else
38813881
(void) vec_dot_q2_K_q8_1_mul_mat;
38823882
assert(false);
3883-
#endif // __CUDA_ARCH__ >= CC_TURING
3883+
#endif // __CUDA_ARCH__ >= CC_VOLTA
38843884
}
38853885

38863886
#define MMQ_X_Q3_K_RDNA2 128
@@ -3901,9 +3901,9 @@ template <bool need_check> static __global__ void
39013901
#if defined(RDNA3) || defined(RDNA2)
39023902
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
39033903
#endif // defined(RDNA3) || defined(RDNA2)
3904-
#elif __CUDA_ARCH__ < CC_TURING
3904+
#elif __CUDA_ARCH__ < CC_VOLTA
39053905
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3906-
#endif // __CUDA_ARCH__ < CC_TURING
3906+
#endif // __CUDA_ARCH__ < CC_VOLTA
39073907
mul_mat_q3_K(
39083908
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
39093909
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3923,7 +3923,7 @@ template <bool need_check> static __global__ void
39233923
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
39243924
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
39253925

3926-
#elif __CUDA_ARCH__ >= CC_TURING
3926+
#elif __CUDA_ARCH__ >= CC_VOLTA
39273927
const int mmq_x = MMQ_X_Q3_K_AMPERE;
39283928
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
39293929
const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3943,7 +3943,7 @@ template <bool need_check> static __global__ void
39433943
#else
39443944
(void) vec_dot_q3_K_q8_1_mul_mat;
39453945
assert(false);
3946-
#endif // __CUDA_ARCH__ >= CC_TURING
3946+
#endif // __CUDA_ARCH__ >= CC_VOLTA
39473947
}
39483948

39493949
#define MMQ_X_Q4_K_RDNA2 64
@@ -3964,9 +3964,9 @@ template <bool need_check> static __global__ void
39643964
#if defined(RDNA3) || defined(RDNA2)
39653965
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
39663966
#endif // defined(RDNA3) || defined(RDNA2)
3967-
#elif __CUDA_ARCH__ < CC_TURING
3967+
#elif __CUDA_ARCH__ < CC_VOLTA
39683968
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3969-
#endif // __CUDA_ARCH__ < CC_TURING
3969+
#endif // __CUDA_ARCH__ < CC_VOLTA
39703970
mul_mat_q4_K(
39713971
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
39723972
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3986,7 +3986,7 @@ template <bool need_check> static __global__ void
39863986
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
39873987
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
39883988

3989-
#elif __CUDA_ARCH__ >= CC_TURING
3989+
#elif __CUDA_ARCH__ >= CC_VOLTA
39903990
const int mmq_x = MMQ_X_Q4_K_AMPERE;
39913991
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
39923992
const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -4006,7 +4006,7 @@ template <bool need_check> static __global__ void
40064006
#else
40074007
(void) vec_dot_q4_K_q8_1_mul_mat;
40084008
assert(false);
4009-
#endif // __CUDA_ARCH__ >= CC_TURING
4009+
#endif // __CUDA_ARCH__ >= CC_VOLTA
40104010
}
40114011

40124012
#define MMQ_X_Q5_K_RDNA2 64
@@ -4047,7 +4047,7 @@ mul_mat_q5_K(
40474047
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
40484048
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
40494049

4050-
#elif __CUDA_ARCH__ >= CC_TURING
4050+
#elif __CUDA_ARCH__ >= CC_VOLTA
40514051
const int mmq_x = MMQ_X_Q5_K_AMPERE;
40524052
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
40534053
const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4067,7 +4067,7 @@ mul_mat_q5_K(
40674067
#else
40684068
(void) vec_dot_q5_K_q8_1_mul_mat;
40694069
assert(false);
4070-
#endif // __CUDA_ARCH__ >= CC_TURING
4070+
#endif // __CUDA_ARCH__ >= CC_VOLTA
40714071
}
40724072

40734073
#define MMQ_X_Q6_K_RDNA2 64
@@ -4088,9 +4088,9 @@ template <bool need_check> static __global__ void
40884088
#if defined(RDNA3) || defined(RDNA2)
40894089
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
40904090
#endif // defined(RDNA3) || defined(RDNA2)
4091-
#elif __CUDA_ARCH__ < CC_TURING
4091+
#elif __CUDA_ARCH__ < CC_VOLTA
40924092
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
4093-
#endif // __CUDA_ARCH__ < CC_TURING
4093+
#endif // __CUDA_ARCH__ < CC_VOLTA
40944094
mul_mat_q6_K(
40954095
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
40964096
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4110,7 +4110,7 @@ template <bool need_check> static __global__ void
41104110
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
41114111
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
41124112

4113-
#elif __CUDA_ARCH__ >= CC_TURING
4113+
#elif __CUDA_ARCH__ >= CC_VOLTA
41144114
const int mmq_x = MMQ_X_Q6_K_AMPERE;
41154115
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
41164116
const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4130,7 +4130,7 @@ template <bool need_check> static __global__ void
41304130
#else
41314131
(void) vec_dot_q6_K_q8_1_mul_mat;
41324132
assert(false);
4133-
#endif // __CUDA_ARCH__ >= CC_TURING
4133+
#endif // __CUDA_ARCH__ >= CC_VOLTA
41344134
}
41354135

41364136
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4674,6 +4674,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
46744674
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
46754675
#endif
46764676
}
4677+
46774678
template<typename dst_t>
46784679
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
46794680
const int nb = k / QK_K;
@@ -4955,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
49554956
mmq_x = MMQ_X_Q4_0_RDNA1;
49564957
mmq_y = MMQ_Y_Q4_0_RDNA1;
49574958
nwarps = NWARPS_Q4_0_RDNA1;
4958-
} else if (compute_capability >= CC_TURING) {
4959+
} else if (compute_capability >= CC_VOLTA) {
49594960
mmq_x = MMQ_X_Q4_0_AMPERE;
49604961
mmq_y = MMQ_Y_Q4_0_AMPERE;
49614962
nwarps = NWARPS_Q4_0_AMPERE;
@@ -5000,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
50005001
mmq_x = MMQ_X_Q4_1_RDNA1;
50015002
mmq_y = MMQ_Y_Q4_1_RDNA1;
50025003
nwarps = NWARPS_Q4_1_RDNA1;
5003-
} else if (compute_capability >= CC_TURING) {
5004+
} else if (compute_capability >= CC_VOLTA) {
50045005
mmq_x = MMQ_X_Q4_1_AMPERE;
50055006
mmq_y = MMQ_Y_Q4_1_AMPERE;
50065007
nwarps = NWARPS_Q4_1_AMPERE;
@@ -5045,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
50455046
mmq_x = MMQ_X_Q5_0_RDNA1;
50465047
mmq_y = MMQ_Y_Q5_0_RDNA1;
50475048
nwarps = NWARPS_Q5_0_RDNA1;
5048-
} else if (compute_capability >= CC_TURING) {
5049+
} else if (compute_capability >= CC_VOLTA) {
50495050
mmq_x = MMQ_X_Q5_0_AMPERE;
50505051
mmq_y = MMQ_Y_Q5_0_AMPERE;
50515052
nwarps = NWARPS_Q5_0_AMPERE;
@@ -5090,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
50905091
mmq_x = MMQ_X_Q5_1_RDNA1;
50915092
mmq_y = MMQ_Y_Q5_1_RDNA1;
50925093
nwarps = NWARPS_Q5_1_RDNA1;
5093-
} else if (compute_capability >= CC_TURING) {
5094+
} else if (compute_capability >= CC_VOLTA) {
50945095
mmq_x = MMQ_X_Q5_1_AMPERE;
50955096
mmq_y = MMQ_Y_Q5_1_AMPERE;
50965097
nwarps = NWARPS_Q5_1_AMPERE;
@@ -5135,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
51355136
mmq_x = MMQ_X_Q8_0_RDNA1;
51365137
mmq_y = MMQ_Y_Q8_0_RDNA1;
51375138
nwarps = NWARPS_Q8_0_RDNA1;
5138-
} else if (compute_capability >= CC_TURING) {
5139+
} else if (compute_capability >= CC_VOLTA) {
51395140
mmq_x = MMQ_X_Q8_0_AMPERE;
51405141
mmq_y = MMQ_Y_Q8_0_AMPERE;
51415142
nwarps = NWARPS_Q8_0_AMPERE;
@@ -5180,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
51805181
mmq_x = MMQ_X_Q2_K_RDNA1;
51815182
mmq_y = MMQ_Y_Q2_K_RDNA1;
51825183
nwarps = NWARPS_Q2_K_RDNA1;
5183-
} else if (compute_capability >= CC_TURING) {
5184+
} else if (compute_capability >= CC_VOLTA) {
51845185
mmq_x = MMQ_X_Q2_K_AMPERE;
51855186
mmq_y = MMQ_Y_Q2_K_AMPERE;
51865187
nwarps = NWARPS_Q2_K_AMPERE;
@@ -5227,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
52275228
mmq_x = MMQ_X_Q3_K_RDNA1;
52285229
mmq_y = MMQ_Y_Q3_K_RDNA1;
52295230
nwarps = NWARPS_Q3_K_RDNA1;
5230-
} else if (compute_capability >= CC_TURING) {
5231+
} else if (compute_capability >= CC_VOLTA) {
52315232
mmq_x = MMQ_X_Q3_K_AMPERE;
52325233
mmq_y = MMQ_Y_Q3_K_AMPERE;
52335234
nwarps = NWARPS_Q3_K_AMPERE;
@@ -5273,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
52735274
mmq_x = MMQ_X_Q4_K_RDNA1;
52745275
mmq_y = MMQ_Y_Q4_K_RDNA1;
52755276
nwarps = NWARPS_Q4_K_RDNA1;
5276-
} else if (compute_capability >= CC_TURING) {
5277+
} else if (compute_capability >= CC_VOLTA) {
52775278
mmq_x = MMQ_X_Q4_K_AMPERE;
52785279
mmq_y = MMQ_Y_Q4_K_AMPERE;
52795280
nwarps = NWARPS_Q4_K_AMPERE;
@@ -5318,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
53185319
mmq_x = MMQ_X_Q5_K_RDNA1;
53195320
mmq_y = MMQ_Y_Q5_K_RDNA1;
53205321
nwarps = NWARPS_Q5_K_RDNA1;
5321-
} else if (compute_capability >= CC_TURING) {
5322+
} else if (compute_capability >= CC_VOLTA) {
53225323
mmq_x = MMQ_X_Q5_K_AMPERE;
53235324
mmq_y = MMQ_Y_Q5_K_AMPERE;
53245325
nwarps = NWARPS_Q5_K_AMPERE;
@@ -5363,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
53635364
mmq_x = MMQ_X_Q6_K_RDNA1;
53645365
mmq_y = MMQ_Y_Q6_K_RDNA1;
53655366
nwarps = NWARPS_Q6_K_RDNA1;
5366-
} else if (compute_capability >= CC_TURING) {
5367+
} else if (compute_capability >= CC_VOLTA) {
53675368
mmq_x = MMQ_X_Q6_K_AMPERE;
53685369
mmq_y = MMQ_Y_Q6_K_AMPERE;
53695370
nwarps = NWARPS_Q6_K_AMPERE;
@@ -5941,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
59415942
switch(type) {
59425943
case GGML_TYPE_Q4_0:
59435944
case GGML_TYPE_Q4_1:
5944-
return max_compute_capability >= CC_TURING ? 128 : 64;
5945+
return max_compute_capability >= CC_VOLTA ? 128 : 64;
59455946
case GGML_TYPE_Q5_0:
59465947
case GGML_TYPE_Q5_1:
59475948
case GGML_TYPE_Q8_0:
@@ -5952,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
59525953
case GGML_TYPE_Q3_K:
59535954
case GGML_TYPE_Q4_K:
59545955
case GGML_TYPE_Q5_K:
5955-
return max_compute_capability >= CC_TURING ? 128 : 64;
5956+
return max_compute_capability >= CC_VOLTA ? 128 : 64;
59565957
case GGML_TYPE_Q6_K:
59575958
return 64;
59585959
default:
@@ -6117,7 +6118,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
61176118

61186119
const int compute_capability = g_compute_capabilities[id];
61196120

6120-
if (compute_capability >= CC_TURING && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) {
6121+
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) {
61216122
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
61226123
half * src0_as_f16 = nullptr;
61236124
size_t src0_as = 0;
@@ -6128,7 +6129,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
61286129
src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
61296130
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
61306131
}
6131-
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (half *) src0_dd_i : src0_as_f16;
6132+
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
61326133

61336134
half * src1_as_f16 = nullptr;
61346135
size_t src1_as = 0;

0 commit comments

Comments
 (0)