@@ -294,8 +294,9 @@ static ggml_cuda_device_info ggml_cuda_init() {
294
294
295
295
ggml_cuda_device_info info = {};
296
296
297
- if (cudaGetDeviceCount (&info.device_count ) != cudaSuccess) {
298
- fprintf (stderr, " %s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n " , __func__);
297
+ cudaError_t err = cudaGetDeviceCount (&info.device_count );
298
+ if (err != cudaSuccess) {
299
+ fprintf (stderr, " %s: failed to initialize " GGML_CUDA_NAME " : %s\n " , __func__, cudaGetErrorString (err));
299
300
return info;
300
301
}
301
302
@@ -369,12 +370,6 @@ struct ggml_cuda_pool {
369
370
370
371
virtual void * alloc (size_t size, size_t * actual_size) = 0;
371
372
virtual void free (void * ptr, size_t size) = 0;
372
-
373
- ggml_cuda_pool () = default ;
374
- ggml_cuda_pool (const ggml_cuda_pool &) = delete ;
375
- ggml_cuda_pool (ggml_cuda_pool &&) = delete ;
376
- ggml_cuda_pool& operator =(const ggml_cuda_pool &) = delete ;
377
- ggml_cuda_pool& operator =(ggml_cuda_pool &&) = delete ;
378
373
};
379
374
380
375
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -6762,6 +6757,123 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
6762
6757
}
6763
6758
}
6764
6759
6760
// Quantize one block of QK5_0 floats at cxi into a block_q5_0 at cdsti.
// The scale is chosen so that the element with the largest magnitude maps
// to -16, i.e. the bottom of the signed 5-bit range [-16, 15].
static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
    const float * src = (const float *) cxi;
    block_q5_0 * dst  = (block_q5_0 *) cdsti;

    // find the element with the largest absolute value, keeping its sign
    float max_abs = 0.0f;
    float max_val = 0.0f;

    for (int i = 0; i < QK5_0; ++i) {
        const float v  = src[i];
        const float av = fabsf(v);
        if (av > max_abs) {
            max_abs = av;
            max_val = v;
        }
    }

    const float d  = max_val / -16;
    const float id = d ? 1.0f/d : 0.0f;

    dst->d = d;

    // low 4 bits of each 5-bit code are packed pairwise into qs;
    // the 5th (high) bits are collected into the 32-bit mask qh
    uint32_t qh = 0;
    for (int i = 0; i < QK5_0/2; ++i) {
        const float x0 = src[i]*id;
        const float x1 = src[QK5_0/2 + i]*id;

        const uint8_t q0 = min(31, (int8_t)(x0 + 16.5f));
        const uint8_t q1 = min(31, (int8_t)(x1 + 16.5f));

        dst->qs[i] = (q0 & 0xf) | ((q1 & 0xf) << 4);
        qh |= ((q0 & 0x10u) >> 4) << (i + 0);
        qh |= ((q1 & 0x10u) >> 4) << (i + QK5_0/2);
    }
    memcpy(dst->qh, &qh, sizeof(qh));
}
6794
+
6795
// Quantize one block of QK5_1 floats at cxi into a block_q5_1 at cdsti.
// Asymmetric 5-bit quantization: dm.x is the step size d, dm.y the block
// minimum m; a value is reconstructed as q*d + m.
static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
    const float * src = (const float *) cxi;
    block_q5_1 * dst  = (block_q5_1 *) cdsti;

    // block-wise minimum and maximum
    float lo = src[0];
    float hi = src[0];

    for (int i = 1; i < QK5_1; ++i) {
        const float v = src[i];
        lo = v < lo ? v : lo;
        hi = v > hi ? v : hi;
    }

    const float d  = (hi - lo) / 31;
    const float id = d ? 1.0f/d : 0.0f;

    dst->dm.x = d;
    dst->dm.y = lo;

    // low nibbles packed pairwise into qs; 5th bits collected into qh
    uint32_t qh = 0;
    for (int i = 0; i < QK5_1/2; ++i) {
        const float x0 = (src[i]           - lo)*id;
        const float x1 = (src[QK5_1/2 + i] - lo)*id;

        const uint8_t q0 = (uint8_t)(x0 + 0.5f);
        const uint8_t q1 = (uint8_t)(x1 + 0.5f);

        dst->qs[i] = (q0 & 0xf) | ((q1 & 0xf) << 4);
        qh |= ((q0 & 0x10u) >> 4) << (i + 0);
        qh |= ((q1 & 0x10u) >> 4) << (i + QK5_1/2);
    }
    memcpy(dst->qh, &qh, sizeof(qh));
}
6828
+
6829
// Return the index of the entry of the ascending table val[0..n) that is
// closest to x. Binary search narrows to two neighbors, then the nearer
// one is chosen.
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
    if (x <= val[0])   return 0;
    if (x >= val[n-1]) return n-1;
    int lo = 0;
    int hi = n-1;
    while (hi - lo > 1) {
        const int mid = (lo + hi)/2;
        if (x < val[mid]) {
            hi = mid;
        } else {
            lo = mid;
        }
    }
    // lo and hi now bracket x; pick whichever neighbor is closer
    return x - val[hi-1] < val[hi] - x ? hi-1 : hi;
}
6839
+
6840
// Quantize one block of QK4_NL floats at cxi into a block_iq4_nl at cdsti.
// Each value is mapped to the nearest entry of the non-linear 4-bit
// codebook kvalues_iq4nl; the scale is then refined with a weighted
// least-squares fit of the chosen codes against the input.
static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
    const float * src  = (const float *) cxi;
    block_iq4_nl * dst = (block_iq4_nl *) cdsti;

    // locate the element with the largest magnitude, sign preserved
    float max_abs = 0.0f;
    float max_val = 0.0f;

    for (int i = 0; i < QK4_NL; ++i) {
        const float v = src[i];
        if (fabsf(v) > max_abs) {
            max_abs = fabsf(v);
            max_val = v;
        }
    }

    // initial scale estimate from the extreme value and the first
    // (most negative) codebook entry
    float d = max_val / kvalues_iq4nl[0];
    const float id = d ? 1.0f/d : 0.0f;

    float sumqx = 0, sumq2 = 0;
    for (int i = 0; i < QK4_NL/2; ++i) {
        const float x0 = src[i]*id;
        const float x1 = src[QK4_NL/2 + i]*id;
        const uint8_t q0 = best_index_int8(16, kvalues_iq4nl, x0);
        const uint8_t q1 = best_index_int8(16, kvalues_iq4nl, x1);
        dst->qs[i] = q0 | (q1 << 4);
        const float v0 = kvalues_iq4nl[q0];
        const float v1 = kvalues_iq4nl[q1];
        // accumulate the weighted least-squares terms (weights = x^2)
        const float w0 = src[i]*src[i];
        const float w1 = src[QK4_NL/2 + i]*src[QK4_NL/2 + i];
        sumqx += w0*v0*src[i] + w1*v1*src[QK4_NL/2 + i];
        sumq2 += w0*v0*v0 + w1*v1*v1;
    }

    // refined scale; fall back to the initial estimate when every
    // weight is zero (all-zero input block)
    dst->d = sumq2 > 0 ? sumqx/sumq2 : d;
}
6875
+
6876
+
6765
6877
template <cpy_kernel_t cpy_blck, int qk>
6766
6878
static __global__ void cpy_f32_q (const char * cx, char * cdst, const int ne,
6767
6879
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -6968,7 +7080,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
6968
7080
}
6969
7081
6970
7082
template <typename T>
6971
- static inline __device__ void swap (T & a, T & b) {
7083
+ static inline __device__ void ggml_cuda_swap (T & a, T & b) {
6972
7084
T tmp = a;
6973
7085
a = b;
6974
7086
b = tmp;
@@ -6997,11 +7109,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
6997
7109
if (ixj > col) {
6998
7110
if ((col & k) == 0 ) {
6999
7111
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
7000
- swap (dst_row[col], dst_row[ixj]);
7112
+ ggml_cuda_swap (dst_row[col], dst_row[ixj]);
7001
7113
}
7002
7114
} else {
7003
7115
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
7004
- swap (dst_row[col], dst_row[ixj]);
7116
+ ggml_cuda_swap (dst_row[col], dst_row[ixj]);
7005
7117
}
7006
7118
}
7007
7119
}
@@ -8495,6 +8607,39 @@ static void ggml_cpy_f32_q4_1_cuda(
8495
8607
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
8496
8608
}
8497
8609
8610
// Host-side launcher: copy/quantize ne F32 elements into Q5_0 blocks on
// the given stream. ne must be a multiple of QK5_0; one thread handles
// one block (launch is <<<ne/QK5_0, 1>>>).
static void ggml_cpy_f32_q5_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;

    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
8620
+
8621
// Host-side launcher: copy/quantize ne F32 elements into Q5_1 blocks on
// the given stream. ne must be a multiple of QK5_1; one thread handles
// one block (launch is <<<ne/QK5_1, 1>>>).
static void ggml_cpy_f32_q5_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;

    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
8631
+
8632
// Host-side launcher: copy/quantize ne F32 elements into IQ4_NL blocks on
// the given stream. ne must be a multiple of QK4_NL; one thread handles
// one block (launch is <<<ne/QK4_NL, 1>>>).
static void ggml_cpy_f32_iq4_nl_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;

    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
8642
+
8498
8643
static void ggml_cpy_f16_f16_cuda (
8499
8644
const char * cx, char * cdst, const int ne,
8500
8645
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -10893,6 +11038,12 @@ static void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * s
10893
11038
ggml_cpy_f32_q4_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
10894
11039
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
10895
11040
ggml_cpy_f32_q4_1_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
11041
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
11042
+ ggml_cpy_f32_q5_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
11043
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
11044
+ ggml_cpy_f32_iq4_nl_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
11045
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
11046
+ ggml_cpy_f32_q5_1_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
10896
11047
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
10897
11048
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
10898
11049
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
@@ -11309,6 +11460,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
11309
11460
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
11310
11461
return true ;
11311
11462
}
11463
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
11464
+ return true ;
11465
+ }
11466
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
11467
+ return true ;
11468
+ }
11469
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
11470
+ return true ;
11471
+ }
11312
11472
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
11313
11473
return true ;
11314
11474
}
0 commit comments