@@ -154,16 +154,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
    GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

    int64_t total_vram = 0;
-// #if defined(GGML_CUDA_FORCE_MMQ)
-//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+// #ifdef GGML_CUDA_FORCE_MMQ
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
// #else
-//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-// #endif
-// #if defined(CUDA_USE_TENSOR_CORES)
-//     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+// #endif // GGML_CUDA_FORCE_MMQ
+// #ifdef GGML_CUDA_FORCE_CUBLAS
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
// #else
-//     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-// #endif
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+// #endif // GGML_CUDA_FORCE_CUBLAS
    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;
@@ -1873,9 +1873,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

-    int64_t min_compute_capability = INT_MAX;
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    bool any_gpus_with_slow_fp16 = false;

-    bool any_pascal_with_slow_fp16 = false;
    if (split) {
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        auto & tensor_split = buft_ctx->tensor_split;
@@ -1885,62 +1893,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                continue;
            }

-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (ggml_cuda_info().devices[id].cc == 610) {
-                any_pascal_with_slow_fp16 = true;
-            }
+            const int cc = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
        }
    } else {
-        min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
-        any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
-    }
-
-    // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
-
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-
-    bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-
-    if (!g_mul_mat_q)
-    {
-        use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
-    }
-
-#else
-
-    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
-
-    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
-    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
-    use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
-
-    if (!g_mul_mat_q)
-    {
-        use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
+        const int cc = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
    }

-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
    // if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

-    const bool use_tensor_cores = fp16_performance_good && !g_mul_mat_q;
-
    // debug helpers
    // printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    // printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -1949,14 +1918,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
+    if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQ single-batch for batch size 1 without FlashAttention
        ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
-    } else if (!split && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
+    } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQV single-batch for batch size 1 without FlashAttention
        ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || use_tensor_cores) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch without FlashAttention
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
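Note on the hunk above: the dispatch no longer consults the removed use_tensor_cores flag; the FP32-precision vector kernels are taken only when any_gpus_with_slow_fp16 is set, and the fp16 cuBLAS batched path otherwise remains eligible. The sketch below mirrors just the branch order visible in this hunk, assuming the gating booleans were already computed as above; mm_path and choose_path are illustrative names, not ggml API, and the branches after use_dequantize_mul_mat_vec are omitted because they are not part of this hunk.

// Compile-only sketch of the branch order after the change (not ggml code).
#include <cstdint>

enum class mm_path {
    mul_mat_vec_p021,        // FP32-precision KQ single-batch
    mul_mat_vec_nc,          // FP32-precision KQV single-batch
    mul_mat_batched_cublas,  // KQ + KQV multi-batch
    dequantize_mul_mat_vec,
    other,                   // later branches, not shown in this hunk
};

static mm_path choose_path(
        bool split, bool any_gpus_with_slow_fp16, bool use_dequantize_mul_mat_vec,
        bool src0_f16, bool src1_f16,
        bool src0_permuted, bool src1_permuted, bool src0_contiguous,
        bool src0_transposed, bool src1_transposed,
        int64_t ne11, int64_t ne12_x_ne13) {
    if (!split && any_gpus_with_slow_fp16 && src0_f16 && src0_permuted && src1_permuted && ne11 == 1) {
        return mm_path::mul_mat_vec_p021;
    }
    if (!split && any_gpus_with_slow_fp16 && src0_f16 && !src0_contiguous && !src1_transposed && ne11 == 1) {
        return mm_path::mul_mat_vec_nc;
    }
    if (!split && src0_f16 && (src1_f16 || !any_gpus_with_slow_fp16)
            && !src0_transposed && !src1_transposed && ne12_x_ne13 > 1) {
        return mm_path::mul_mat_batched_cublas;
    }
    if (use_dequantize_mul_mat_vec) {
        return mm_path::dequantize_mul_mat_vec;
    }
    return mm_path::other;
}

// example: a permuted F16 KQ-style product with a single src1 column on a
// system that contains at least one slow-fp16 GPU lands on the p021 kernel
int main() {
    const mm_path p = choose_path(/*split=*/false, /*any_gpus_with_slow_fp16=*/true,
                                  /*use_dequantize_mul_mat_vec=*/false,
                                  /*src0_f16=*/true, /*src1_f16=*/false,
                                  /*src0_permuted=*/true, /*src1_permuted=*/true, /*src0_contiguous=*/true,
                                  /*src0_transposed=*/false, /*src1_transposed=*/false,
                                  /*ne11=*/1, /*ne12_x_ne13=*/1);
    return p == mm_path::mul_mat_vec_p021 ? 0 : 1;
}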