@@ -60,6 +60,16 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_DMMV_F16
 
+static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
     const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
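
For context: the second hunk below also calls get_int_from_int8_aligned, which is not shown in this diff. Presumably it is the 4-byte-aligned counterpart that can load the int in a single access instead of assembling it from two 16-bit halves; a minimal sketch, assuming that name and behavior:

    // Hypothetical aligned counterpart (not part of this diff): with at least
    // 4 byte alignment the 32-bit value can be read with one aligned load.
    static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
        return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
    }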
@@ -1602,27 +1612,30 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
         y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
+    const int & vi, const int & ui, const half & d8_0, const half2 & ds8_1) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int vi;
-    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
-    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-
-    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->ds.x);
-
     // SIMD dot product of quantized values
-    int sumi = __dp4a(vi, ui, 0);
+    const int sumi = __dp4a(vi, ui, 0);
 
-    return sumi*d;
+    return sumi * __half2float(d8_0) * __half2float(ds8_1.x);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    const int vi = get_int_from_int8(bq8_0->qs, iqs);
+    const int ui = get_int_from_int8_aligned(bq8_1->qs, iqs);
+
+    return vec_dot_q8_0_q8_1_impl(vi, ui, bq8_0->d, bq8_1->ds);
+}
+
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
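
On devices with __CUDA_ARCH__ >= MIN_CC_DP4A, __dp4a(vi, ui, 0) treats each int as four packed signed 8-bit lanes and returns their dot product; the refactored impl then scales that sum by the two block scales (d8_0 from the q8_0 block, ds8_1.x from the q8_1 block). A host-side scalar reference for checking the lane math, assuming signed 8-bit lanes (a sketch, not code from this PR):

    #include <stdint.h>

    // Scalar equivalent of __dp4a(a, b, c): the sum of the four per-byte
    // products, each byte interpreted as signed 8-bit, plus the accumulator c.
    static int dp4a_ref(int a, int b, int c) {
        for (int lane = 0; lane < 4; ++lane) {
            const int8_t av = (int8_t) ((a >> 8*lane) & 0xff);
            const int8_t bv = (int8_t) ((b >> 8*lane) & 0xff);
            c += av * bv;
        }
        return c;
    }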