@@ -60,6 +60,24 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_DMMV_F16
 
+static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
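
The three helpers above centralize the int-sized loads that the vec_dot and load_tiles kernels previously open-coded with memcpy and raw pointer casts. A minimal host-side sketch of the two-halfword assembly done by get_int_from_uint8, assuming a little-endian target as on NVIDIA GPUs (the _host suffix and the sample data are illustrative, not part of the PR):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    // Build one 32-bit value from two 16-bit loads, mirroring the device helper.
    static int get_int_from_uint8_host(const uint8_t * x8, int i32) {
        uint16_t lo, hi;
        memcpy(&lo, x8 + sizeof(int) * i32 + 0, sizeof(lo)); // low  halfword
        memcpy(&hi, x8 + sizeof(int) * i32 + 2, sizeof(hi)); // high halfword
        return (int)((uint32_t)lo | ((uint32_t)hi << 16));
    }

    int main(void) {
        const uint8_t qs[8] = {0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
        assert(get_int_from_uint8_host(qs, 1) == (int)0xefcdab89); // bytes 4..7
        return 0;
    }
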
@@ -1315,10 +1333,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
-    int vi;
-    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+    const int vi  = get_int_from_uint8(bq4_0->qs, iqs);
+    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_0);
 
     return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, bq4_0->d, bq8_1->ds);
 }
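
The helper chosen in each spot tracks the alignment the pointer actually guarantees: bq4_0->qs follows a single 2-byte half, so only 2-byte alignment is certain and the two-halfword path is required, while bq8_1->qs follows 4 bytes of scale data and can be read with one aligned int load. A sketch of the layouts this reasoning assumes (the _sketch types and half_t stand-in are illustrative; the real definitions live in ggml's headers):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint16_t half_t; // stand-in for CUDA's half

    typedef struct {
        half_t  d;      // scale
        uint8_t qs[16]; // offset 2 -> only 2-byte alignment guaranteed
    } block_q4_0_sketch;

    typedef struct {
        half_t ds[2];  // scale and sum (a half2 on the device)
        int8_t qs[32]; // offset 4 -> safe for a direct int load
    } block_q8_1_sketch;

    int main(void) {
        assert(offsetof(block_q4_0_sketch, qs) == 2);
        assert(offsetof(block_q8_1_sketch, qs) == 4);
        return 0;
    }
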
@@ -1337,11 +1354,11 @@ static __device__ __forceinline__ void load_tiles_q4_0(
     int8_t * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) {
 
     const int kbx = k / QI4_0;
-    const int kqsx = sizeof(int) * (k % QI4_0);
+    const int kqsx = k % QI4_0;
 
     const block_q4_0 * bx = ((block_q4_0 *) vx) + i*blocks_per_row + kbx;
 
-    memcpy(&x_ql[i * WARP_SIZE + i + k], &bx->qs[kqsx], sizeof(int));
+    x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bx->qs, kqsx);
     x_dm[i * (WARP_SIZE / QI4_0) + kbx].x = bx->d;
 }
 
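The new store writes the same flat index as the old one, only expressed with the padded row stride the tile actually uses; the extra column per row staggers consecutive rows across shared memory banks, the usual padding trick against bank conflicts. A quick host-side check of the index identity (WARP_SIZE == 32 as in ggml-cuda):

    #include <assert.h>

    #define WARP_SIZE 32

    int main(void) {
        for (int i = 0; i < 8; ++i) {
            for (int k = 0; k < WARP_SIZE; ++k) {
                // old form == new form for every element of the tile row
                assert(i*WARP_SIZE + i + k == i*(WARP_SIZE + 1) + k);
            }
        }
        return 0;
    }
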
@@ -1388,9 +1405,9 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
-    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+    const int vi  = get_int_from_uint8_aligned(bq4_1->qs, iqs);
+    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_1);
 
     return vec_dot_q4_1_q8_1_impl(vi, ui0, ui1, bq4_1->dm, bq8_1->ds);
 }
@@ -1409,11 +1426,11 @@ static __device__ __forceinline__ void load_tiles_q4_1(
     int8_t * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) {
 
     const int kbx = k / QI4_1;
-    const int kqsx = sizeof(int) * (k % QI4_1);
+    const int kqsx = k % QI4_1;
 
     const block_q4_1 * bx = ((block_q4_1 *) vx) + i*blocks_per_row + kbx;
 
-    x_ql[i * WARP_SIZE + i + k] = *((int *) &bx->qs[kqsx]);
+    x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bx->qs, kqsx);
     x_dm[i * (WARP_SIZE / QI4_1) + kbx] = bx->dm;
 }
 
@@ -1433,18 +1450,18 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
-    vi0 |= (qh0 <<  4) & 0x00000010; // 1 ->  5
-    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
-    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
-    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 |= (qh0 <<  4) & 0x00000010; // 0 ->  4
+    vi0 |= (qh0 << 11) & 0x00001000; // 1 -> 12
+    vi0 |= (qh0 << 18) & 0x00100000; // 2 -> 20
+    vi0 |= (qh0 << 25) & 0x10000000; // 3 -> 28
     vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
     int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
 
     int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
-    vi1 |= (qh1 <<  4) & 0x00000010; // 1 ->  5
-    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
-    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
-    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 |= (qh1 <<  4) & 0x00000010; // 0 ->  4
+    vi1 |= (qh1 << 11) & 0x00001000; // 1 -> 12
+    vi1 |= (qh1 << 18) & 0x00100000; // 2 -> 20
+    vi1 |= (qh1 << 25) & 0x10000000; // 3 -> 28
     vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
     sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
 
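Only the comments change in this hunk: they now give 0-indexed bit positions. Shift s places source bit b of qh at bit 8*b + 4, i.e. bit 4 of byte lane b, exactly the fifth quant bit each lane needs before the __vsub4/__dp4a sequence. A standalone check of that mapping, with the mask constants copied from the kernel:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        for (int b = 0; b < 4; ++b) {
            const uint32_t qh = 1u << b; // set only source bit b
            uint32_t vi = 0;
            vi |= (qh <<  4) & 0x00000010u; // bit 0 -> bit  4
            vi |= (qh << 11) & 0x00001000u; // bit 1 -> bit 12
            vi |= (qh << 18) & 0x00100000u; // bit 2 -> bit 20
            vi |= (qh << 25) & 0x10000000u; // bit 3 -> bit 28
            assert(vi == 1u << (8*b + 4)); // lands at bit 4 of byte lane b
        }
        return 0;
    }
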
@@ -1459,12 +1476,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
-    int qs;
-    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qs = get_int_from_uint8(bq5_0->qs, iqs);
     const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
     const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_0);
 
     return vec_dot_q5_0_q8_1_impl(qs, qh0, qh1, ui0, ui1, bq5_0->d, bq8_1->ds);
 }