@@ -80,9 +80,9 @@
 #include "ggml.h"
 
 #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_TURING     700
+#define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA2      CC_OFFSET_AMD + 1030
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
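Note: compute capability 7.0 is Volta (Turing is 7.5), so the rename makes the constant's name match its unchanged value of 700. The new parentheses around CC_RDNA2 are standard macro hygiene: an unparenthesized body is spliced into the surrounding expression verbatim, and operator precedence can then change the result. A minimal self-contained illustration (hypothetical macros, not code from this patch):

// An unparenthesized macro body interacts with the surrounding expression:
#define BAD_RDNA2  1000000 + 1030
#define GOOD_RDNA2 (1000000 + 1030)

static_assert(2 * BAD_RDNA2  == 2001030, "expands to 2*1000000 + 1030");
static_assert(2 * GOOD_RDNA2 == 2002060, "expands to 2*(1000000 + 1030)");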
@@ -3553,7 +3553,7 @@ template <bool need_check> static __global__ void
     load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3573,7 +3573,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q4_1_RDNA2 64
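Note: each mul_mat_q* kernel below repeats the same compile-time ladder, which is why the rename recurs once per quantization type in each tier. A condensed sketch of the pattern (illustrative only; the Pascal tier and its MIN_CC_DP4A guard are assumed from the surrounding code rather than shown in this diff):

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    // RDNA1/RDNA2 tile sizes: MMQ_X_Q4_0_RDNA*, MMQ_Y_Q4_0_RDNA*, NWARPS_Q4_0_RDNA*
#elif __CUDA_ARCH__ >= CC_VOLTA
    // large tiles: MMQ_X_Q4_0_AMPERE, MMQ_Y_Q4_0_AMPERE, NWARPS_Q4_0_AMPERE
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    // smaller tiles for Pascal-class GPUs that still have __dp4a
#else
    (void) vec_dot_q4_0_q8_1_mul_mat; // keep the symbol referenced
    assert(false);                    // this specialization must never run
#endif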
@@ -3594,9 +3594,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_TURING
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_TURING
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
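Note: `__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)` caps the kernel's register budget so the requested occupancy stays reachable; after this change it applies to NVIDIA GPUs below Volta rather than below Turing. A standalone toy example of the attribute (hypothetical kernel, not from this patch):

__global__ void __launch_bounds__(128, 2) // <= 128 threads/block, target >= 2 blocks/SM
scale_kernel(float * dst, const float * src, const int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = 2.0f*src[i];
    }
}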
@@ -3616,7 +3616,7 @@ template <bool need_check> static __global__ void
     load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3636,7 +3636,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q5_0_RDNA2 64
@@ -3677,7 +3677,7 @@ template <bool need_check> static __global__ void
     load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3697,7 +3697,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q5_1_RDNA2 64
@@ -3738,7 +3738,7 @@ mul_mat_q5_1(
     load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3758,7 +3758,7 @@ mul_mat_q5_1(
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q8_0_RDNA2 64
@@ -3799,7 +3799,7 @@ template <bool need_check> static __global__ void
     load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3819,7 +3819,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q2_K_RDNA2 64
@@ -3860,7 +3860,7 @@ mul_mat_q2_K(
     load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3880,7 +3880,7 @@ mul_mat_q2_K(
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q3_K_RDNA2 128
@@ -3901,9 +3901,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_TURING
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_TURING
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3923,7 +3923,7 @@ template <bool need_check> static __global__ void
     load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3943,7 +3943,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q4_K_RDNA2 64
@@ -3964,9 +3964,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_TURING
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_TURING
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3986,7 +3986,7 @@ template <bool need_check> static __global__ void
     load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -4006,7 +4006,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q5_K_RDNA2 64
@@ -4047,7 +4047,7 @@ mul_mat_q5_K(
     load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4067,7 +4067,7 @@ mul_mat_q5_K(
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q6_K_RDNA2 64
@@ -4088,9 +4088,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_TURING
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_TURING
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4110,7 +4110,7 @@ template <bool need_check> static __global__ void
     load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >= CC_TURING
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4130,7 +4130,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >= CC_TURING
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4674,6 +4674,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
 #endif
 }
+
 template<typename dst_t>
 static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -4955,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
         mmq_x  =  MMQ_X_Q4_0_RDNA1;
         mmq_y  =  MMQ_Y_Q4_0_RDNA1;
         nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q4_0_AMPERE;
         mmq_y  =  MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
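Note: these host-side launchers select the same tiers at runtime from `compute_capability`. A hedged sketch of how the raw device properties are usually encoded so they line up with the `CC_*` constants (the helper is illustrative, not part of ggml-cuda.cu):

// 100*major + 10*minor, with AMD devices shifted past CC_OFFSET_AMD so their
// range cannot collide with NVIDIA compute capabilities.
static int encode_compute_capability(int major, int minor, bool is_amd) {
    return 100*major + 10*minor + (is_amd ? CC_OFFSET_AMD : 0);
}
// encode_compute_capability( 6, 1, false) == 610      -> >= MIN_CC_DP4A, Pascal tiles
// encode_compute_capability( 7, 0, false) == 700      -> >= CC_VOLTA, "AMPERE" tiles
// encode_compute_capability(10, 3, true ) == CC_RDNA2 -> RDNA2 tiles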
@@ -5000,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
         mmq_x  =  MMQ_X_Q4_1_RDNA1;
         mmq_y  =  MMQ_Y_Q4_1_RDNA1;
         nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q4_1_AMPERE;
         mmq_y  =  MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;
@@ -5045,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
         mmq_x  =  MMQ_X_Q5_0_RDNA1;
         mmq_y  =  MMQ_Y_Q5_0_RDNA1;
         nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q5_0_AMPERE;
         mmq_y  =  MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;
@@ -5090,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
         mmq_x  =  MMQ_X_Q5_1_RDNA1;
         mmq_y  =  MMQ_Y_Q5_1_RDNA1;
         nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q5_1_AMPERE;
         mmq_y  =  MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;
@@ -5135,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
         mmq_x  =  MMQ_X_Q8_0_RDNA1;
         mmq_y  =  MMQ_Y_Q8_0_RDNA1;
         nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q8_0_AMPERE;
         mmq_y  =  MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;
@@ -5180,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
         mmq_x  =  MMQ_X_Q2_K_RDNA1;
         mmq_y  =  MMQ_Y_Q2_K_RDNA1;
         nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q2_K_AMPERE;
         mmq_y  =  MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;
@@ -5227,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mmq_x  =  MMQ_X_Q3_K_RDNA1;
         mmq_y  =  MMQ_Y_Q3_K_RDNA1;
         nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q3_K_AMPERE;
         mmq_y  =  MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;
@@ -5273,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
         mmq_x  =  MMQ_X_Q4_K_RDNA1;
         mmq_y  =  MMQ_Y_Q4_K_RDNA1;
         nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q4_K_AMPERE;
         mmq_y  =  MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;
@@ -5318,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
         mmq_x  =  MMQ_X_Q5_K_RDNA1;
         mmq_y  =  MMQ_Y_Q5_K_RDNA1;
         nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q5_K_AMPERE;
         mmq_y  =  MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;
@@ -5363,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
         mmq_x  =  MMQ_X_Q6_K_RDNA1;
         mmq_y  =  MMQ_Y_Q6_K_RDNA1;
         nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >= CC_TURING) {
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x  =  MMQ_X_Q6_K_AMPERE;
         mmq_y  =  MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;
@@ -5941,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_TURING ? 128 : 64;
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -5952,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
-            return max_compute_capability >= CC_TURING ? 128 : 64;
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
         default:
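Note: the value returned by get_row_rounding() is the granularity at which a tensor's rows are split across GPUs, so Volta+ now gets the 128-row alignment that matches the larger MMQ tiles. A hypothetical illustration of applying such a rounding value (not the literal splitting code):

static int64_t round_row_split(int64_t nrows, float split, int64_t rounding) {
    const int64_t raw = (int64_t)(nrows*split);
    return (raw/rounding)*rounding; // snap down to a multiple of `rounding`
}
// round_row_split(4096, 0.6f, 128) == 2432: 4096*0.6 = 2457.6, snapped to 19*128,
// keeping the per-device row boundary tile-aligned.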
@@ -6117,7 +6118,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     const int compute_capability = g_compute_capabilities[id];
 
-    if (compute_capability >= CC_TURING && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) {
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         half * src0_as_f16 = nullptr;
         size_t src0_as = 0;
@@ -6128,7 +6129,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
             to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
         }
-        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (half *) src0_dd_i : src0_as_f16;
+        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
 
         half * src1_as_f16 = nullptr;
         size_t src1_as = 0;
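Note: besides extending the fp16 cuBLAS path down to Volta, the second hunk is a const-correctness fix: `src0_dd_i` points at read-only data, and the old `(half *)` cast silently stripped the qualifier. With `(const half *)`, both arms of the conditional converge on `const half *` without casting anything away. A minimal illustration (hypothetical buffers, not this function's locals):

const char * device_buf = nullptr; // read-only device data, like src0_dd_i
half * scratch_f16 = nullptr;      // writable fp16 scratch, like src0_as_f16

// old: (half *) device_buf compiles, but casts away const
// new: both arms already convert to const half *, nothing is lost
const half * ptr = true ? (const half *) device_buf : scratch_f16;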