@@ -5554,6 +5554,40 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
+static __device__ void cpy_blck_f16_q8_0(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    half amax = 0.0; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const half v = xi[j];
+        amax = __hmax(amax, __habs(v));
+    }
+
+    const half d  = amax / (half)((1 << 7) - 1);
+    const half id = d ? ((half)1.0)/d : (half)0.0;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const half x0 = xi[j]*id;
+
+        dsti->qs[j] = __half2int_rz(x0);
+    }
+}
+
+static __device__ void cpy_blck_q8_0_f16(const char * cxi, char * cdsti) {
+    const block_q8_0 * xi = (const block_q8_0 *) cxi;
+    half * dsti = (half *) cdsti;
+
+    const half d = xi->d;
+
+    for (int j = 0; j < QK8_0; j++) {
+        dsti[j] = (half)xi->qs[j] * d;
+    }
+}
+
 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_q8_0 * dsti = (block_q8_0 *) cdsti;
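The two new device functions above are the F16 analogues of the existing F32 paths: cpy_blck_f16_q8_0 quantizes one block of QK8_0 = 32 half values into 32 int8 quants plus a single half scale d = amax/127, and cpy_blck_q8_0_f16 dequantizes by scaling each quant back by d. A rough host-side sketch of the same round-trip (plain C++, with float standing in for half; the local BlockQ8_0 struct is a stand-in for ggml's block_q8_0, whose real scale field is fp16):

#include <cmath>
#include <cstdint>

constexpr int QK8_0_REF = 32;                   // mirrors ggml's QK8_0
struct BlockQ8_0 {                              // stand-in for ggml's block_q8_0
    float  d;                                   // scale (fp16 in the real struct)
    int8_t qs[QK8_0_REF];                       // quantized values
};

static void quantize_block_ref(const float * x, BlockQ8_0 * b) {
    float amax = 0.0f;                          // absolute max over the block
    for (int j = 0; j < QK8_0_REF; j++) {
        amax = fmaxf(amax, fabsf(x[j]));
    }
    const float d  = amax / 127.0f;             // map the max to +/-127
    const float id = d != 0.0f ? 1.0f/d : 0.0f; // guard an all-zero block
    b->d = d;
    for (int j = 0; j < QK8_0_REF; j++) {
        b->qs[j] = (int8_t) truncf(x[j]*id);    // truncate, like __half2int_rz
    }
}

static void dequantize_block_ref(const BlockQ8_0 * b, float * y) {
    for (int j = 0; j < QK8_0_REF; j++) {
        y[j] = b->qs[j] * b->d;                 // undo the scaling
    }
}
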
@@ -5573,7 +5607,7 @@ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     for (int j = 0; j < QK8_0; ++j) {
         const float x0 = xi[j]*id;
 
-        dsti->qs[j] = roundf(x0);
+        dsti->qs[j] = __half2int_rz(x0);
     }
 }
 
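Note that this one-line change in cpy_blck_f32_q8_0 swaps rounding semantics: roundf rounds to the nearest integer (ties away from zero), while __half2int_rz converts with round-toward-zero, i.e. truncation; and since x0 is a float here, the value first passes through an implicit float-to-half conversion. A small host-side illustration of the difference, with truncf standing in for the _rz mode:

#include <cmath>
#include <cstdio>

int main() {
    const float xs[] = {2.3f, 2.7f, -2.7f};
    for (const float x : xs) {
        // roundf: nearest integer; truncf: toward zero, as the _rz suffix denotes
        printf("x = %4.1f  roundf = %4.0f  trunc = %4.0f\n", x, roundf(x), truncf(x));
    }
    // prints: 2.3 -> 2 / 2,  2.7 -> 3 / 2,  -2.7 -> -3 / -2
    return 0;
}
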
@@ -5667,6 +5701,32 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
+
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
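cpy_q_f32 is the mirror image of the existing cpy_f32_q kernel: each thread owns one block of qk quantized source elements, decomposes its flat element index i into 4D coordinates, and turns those into byte offsets via the nb strides (i00/qk on the quantized side, where nb00 is the stride of one whole block). The decomposition arithmetic, checked on the host with a hypothetical shape:

#include <cstdio>

int main() {
    // hypothetical shape (ne00, ne01, ne02, ...), innermost dimension first
    const int ne00 = 32, ne01 = 4, ne02 = 2;
    const int i = 301; // flat element index, same arithmetic as the kernel
    const int i03 = i/(ne00*ne01*ne02);
    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
    const int i00 =  i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    printf("i = %d -> (i00, i01, i02, i03) = (%d, %d, %d, %d)\n",
           i, i00, i01, i02, i03); // prints (13, 1, 0, 1)
    return 0;
}
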
@@ -7382,7 +7442,26 @@ static void ggml_cpy_f16_f16_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+static void ggml_cpy_f16_q8_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    cpy_f32_q<cpy_blck_f16_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
 
+static void ggml_cpy_q8_0_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q8_0_f16, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
 
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
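The two wrappers pick different grid sizes: ggml_cpy_f16_q8_0_cuda launches ne/QK8_0 single-thread blocks, one per quantized block, while ggml_cpy_q8_0_f16_cuda launches ne blocks; since each thread computes i = blockIdx.x*qk and bails out once i >= ne, only the first ne/QK8_0 of those blocks do any work and the rest exit immediately. The launch arithmetic on the host, with a hypothetical element count:

#include <cassert>
#include <cstdio>

int main() {
    const int QK8_0 = 32;
    const int ne = 4096;                      // hypothetical element count
    assert(ne % QK8_0 == 0);                  // mirrors the GGML_ASSERT above
    const int blocks_f16_to_q8 = ne / QK8_0;  // 128 blocks, all active
    const int blocks_q8_to_f16 = ne;          // 4096 blocks, 128 active
    printf("f16->q8_0: %d blocks; q8_0->f16: %d launched (%d active)\n",
           blocks_f16_to_q8, blocks_q8_to_f16, ne / QK8_0);
    return 0;
}
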
@@ -10373,6 +10452,10 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
         ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f16_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f16_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_q8_0_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
             ggml_type_name(src0->type), ggml_type_name(src1->type));