@@ -3418,23 +3418,20 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3418
3418
const int mmq_x = MMQ_X_Q4_0_AMPERE;
3419
3419
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3420
3420
const int nwarps = NWARPS_Q4_0_AMPERE;
3421
-
3422
- mul_mat_q<QK4_0, QR4_0, QI4_0, true , block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3423
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3424
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3425
-
3426
3421
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3427
3422
const int mmq_x = MMQ_X_Q4_0_PASCAL;
3428
3423
const int mmq_y = MMQ_Y_Q4_0_PASCAL;
3429
3424
const int nwarps = NWARPS_Q4_0_PASCAL;
3430
-
3431
- mul_mat_q<QK4_0, QR4_0, QI4_0, true , block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3432
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3433
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3434
3425
#else
3435
- (void ) vec_dot_q4_0_q8_1_mul_mat;
3426
+ const int mmq_x = -1 ;
3427
+ const int mmq_y = -1 ;
3428
+ const int nwarps = -1 ;
3436
3429
assert (false );
3437
3430
#endif // __CUDA_ARCH__ >= CC_TURING
3431
+
3432
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true , block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3433
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3434
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3438
3435
}
3439
3436
3440
3437
#define MMQ_X_Q4_1_AMPERE 64
@@ -3457,23 +3454,20 @@ template <bool need_check> static __global__ void
3457
3454
const int mmq_x = MMQ_X_Q4_1_AMPERE;
3458
3455
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3459
3456
const int nwarps = NWARPS_Q4_1_AMPERE;
3460
-
3461
- mul_mat_q<QK4_1, QR4_1, QI4_1, true , block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3462
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3463
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3464
-
3465
3457
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3466
3458
const int mmq_x = MMQ_X_Q4_1_PASCAL;
3467
3459
const int mmq_y = MMQ_Y_Q4_1_PASCAL;
3468
3460
const int nwarps = NWARPS_Q4_1_PASCAL;
3469
-
3470
- mul_mat_q<QK4_1, QR4_1, QI4_1, true , block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3471
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3472
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3473
3461
#else
3474
- (void ) vec_dot_q4_1_q8_1_mul_mat;
3462
+ const int mmq_x = -1 ;
3463
+ const int mmq_y = -1 ;
3464
+ const int nwarps = -1 ;
3475
3465
assert (false );
3476
3466
#endif // __CUDA_ARCH__ >= CC_TURING
3467
+
3468
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true , block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3469
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3470
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3477
3471
}
3478
3472
3479
3473
#define MMQ_X_Q5_0_AMPERE 128
@@ -3492,23 +3486,20 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3492
3486
const int mmq_x = MMQ_X_Q5_0_AMPERE;
3493
3487
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3494
3488
const int nwarps = NWARPS_Q5_0_AMPERE;
3495
-
3496
- mul_mat_q<QK5_0, QR5_0, QI5_0, false , block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3497
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3498
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3499
-
3500
3489
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3501
3490
const int mmq_x = MMQ_X_Q5_0_PASCAL;
3502
3491
const int mmq_y = MMQ_Y_Q5_0_PASCAL;
3503
3492
const int nwarps = NWARPS_Q5_0_PASCAL;
3504
-
3505
- mul_mat_q<QK5_0, QR5_0, QI5_0, false , block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3506
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3507
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3508
3493
#else
3509
- (void ) vec_dot_q5_0_q8_1_mul_mat;
3494
+ const int mmq_x = -1 ;
3495
+ const int mmq_y = -1 ;
3496
+ const int nwarps = -1 ;
3510
3497
assert (false );
3511
3498
#endif // __CUDA_ARCH__ >= CC_TURING
3499
+
3500
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false , block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3501
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3502
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3512
3503
}
3513
3504
3514
3505
#define MMQ_X_Q5_1_AMPERE 128
@@ -3527,23 +3518,20 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3527
3518
const int mmq_x = MMQ_X_Q5_1_AMPERE;
3528
3519
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3529
3520
const int nwarps = NWARPS_Q5_1_AMPERE;
3530
-
3531
- mul_mat_q<QK5_1, QR5_1, QI5_1, true , block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3532
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3533
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3534
-
3535
3521
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3536
3522
const int mmq_x = MMQ_X_Q5_1_PASCAL;
3537
3523
const int mmq_y = MMQ_Y_Q5_1_PASCAL;
3538
3524
const int nwarps = NWARPS_Q5_1_PASCAL;
3539
-
3540
- mul_mat_q<QK5_1, QR5_1, QI5_1, true , block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3541
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3542
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3543
3525
#else
3544
- (void ) vec_dot_q5_1_q8_1_mul_mat;
3526
+ const int mmq_x = -1 ;
3527
+ const int mmq_y = -1 ;
3528
+ const int nwarps = -1 ;
3545
3529
assert (false );
3546
3530
#endif // __CUDA_ARCH__ >= CC_TURING
3531
+
3532
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true , block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3533
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3534
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3547
3535
}
3548
3536
3549
3537
#define MMQ_X_Q8_0_AMPERE 128
@@ -3562,23 +3550,20 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3562
3550
const int mmq_x = MMQ_X_Q8_0_AMPERE;
3563
3551
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3564
3552
const int nwarps = NWARPS_Q8_0_AMPERE;
3565
-
3566
- mul_mat_q<QK8_0, QR8_0, QI8_0, false , block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3567
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3568
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3569
-
3570
3553
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3571
3554
const int mmq_x = MMQ_X_Q8_0_PASCAL;
3572
3555
const int mmq_y = MMQ_Y_Q8_0_PASCAL;
3573
3556
const int nwarps = NWARPS_Q8_0_PASCAL;
3574
-
3575
- mul_mat_q<QK8_0, QR8_0, QI8_0, false , block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3576
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3577
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3578
3557
#else
3579
- (void ) vec_dot_q8_0_q8_1_mul_mat;
3558
+ const int mmq_x = -1 ;
3559
+ const int mmq_y = -1 ;
3560
+ const int nwarps = -1 ;
3580
3561
assert (false );
3581
3562
#endif // __CUDA_ARCH__ >= CC_TURING
3563
+
3564
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false , block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3565
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3566
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3582
3567
}
3583
3568
3584
3569
#define MMQ_X_Q2_K_AMPERE 64
@@ -3597,23 +3582,20 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3597
3582
const int mmq_x = MMQ_X_Q2_K_AMPERE;
3598
3583
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3599
3584
const int nwarps = NWARPS_Q2_K_AMPERE;
3600
-
3601
- mul_mat_q<QK_K, QR2_K, QI2_K, false , block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3602
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3603
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3604
-
3605
3585
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3606
3586
const int mmq_x = MMQ_X_Q2_K_PASCAL;
3607
3587
const int mmq_y = MMQ_Y_Q2_K_PASCAL;
3608
3588
const int nwarps = NWARPS_Q2_K_PASCAL;
3609
-
3610
- mul_mat_q<QK_K, QR2_K, QI2_K, false , block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3611
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3612
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3613
3589
#else
3614
- (void ) vec_dot_q2_K_q8_1_mul_mat;
3590
+ const int mmq_x = -1 ;
3591
+ const int mmq_y = -1 ;
3592
+ const int nwarps = -1 ;
3615
3593
assert (false );
3616
3594
#endif // __CUDA_ARCH__ >= CC_TURING
3595
+
3596
+ mul_mat_q<QK_K, QR2_K, QI2_K, false , block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3597
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3598
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3617
3599
}
3618
3600
3619
3601
#define MMQ_X_Q3_K_AMPERE 128
@@ -3636,23 +3618,20 @@ template <bool need_check> static __global__ void
3636
3618
const int mmq_x = MMQ_X_Q3_K_AMPERE;
3637
3619
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3638
3620
const int nwarps = NWARPS_Q3_K_AMPERE;
3639
-
3640
- mul_mat_q<QK_K, QR3_K, QI3_K, false , block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3641
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3642
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3643
-
3644
3621
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3645
3622
const int mmq_x = MMQ_X_Q3_K_PASCAL;
3646
3623
const int mmq_y = MMQ_Y_Q3_K_PASCAL;
3647
3624
const int nwarps = NWARPS_Q3_K_PASCAL;
3648
-
3649
- mul_mat_q<QK_K, QR3_K, QI3_K, false , block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3650
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3651
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3652
3625
#else
3653
- (void ) vec_dot_q3_K_q8_1_mul_mat;
3626
+ const int mmq_x = -1 ;
3627
+ const int mmq_y = -1 ;
3628
+ const int nwarps = -1 ;
3654
3629
assert (false );
3655
3630
#endif // __CUDA_ARCH__ >= CC_TURING
3631
+
3632
+ mul_mat_q<QK_K, QR3_K, QI3_K, false , block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3633
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3634
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3656
3635
}
3657
3636
3658
3637
#define MMQ_X_Q4_K_AMPERE 64
@@ -3675,23 +3654,20 @@ template <bool need_check> static __global__ void
3675
3654
const int mmq_x = MMQ_X_Q4_K_AMPERE;
3676
3655
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3677
3656
const int nwarps = NWARPS_Q4_K_AMPERE;
3678
-
3679
- mul_mat_q<QK_K, QR4_K, QI4_K, true , block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3680
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3681
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3682
-
3683
3657
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3684
3658
const int mmq_x = MMQ_X_Q4_K_PASCAL;
3685
3659
const int mmq_y = MMQ_Y_Q4_K_PASCAL;
3686
3660
const int nwarps = NWARPS_Q4_K_PASCAL;
3687
-
3688
- mul_mat_q<QK_K, QR4_K, QI4_K, true , block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3689
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3690
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3691
3661
#else
3692
- (void ) vec_dot_q4_K_q8_1_mul_mat;
3662
+ const int mmq_x = -1 ;
3663
+ const int mmq_y = -1 ;
3664
+ const int nwarps = -1 ;
3693
3665
assert (false );
3694
3666
#endif // __CUDA_ARCH__ >= CC_TURING
3667
+
3668
+ mul_mat_q<QK_K, QR4_K, QI4_K, true , block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3669
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3670
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3695
3671
}
3696
3672
3697
3673
#define MMQ_X_Q5_K_AMPERE 64
@@ -3710,23 +3686,20 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3710
3686
const int mmq_x = MMQ_X_Q5_K_AMPERE;
3711
3687
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3712
3688
const int nwarps = NWARPS_Q5_K_AMPERE;
3713
-
3714
- mul_mat_q<QK_K, QR5_K, QI5_K, true , block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3715
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3716
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3717
-
3718
3689
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3719
3690
const int mmq_x = MMQ_X_Q5_K_PASCAL;
3720
3691
const int mmq_y = MMQ_Y_Q5_K_PASCAL;
3721
3692
const int nwarps = NWARPS_Q5_K_PASCAL;
3722
-
3723
- mul_mat_q<QK_K, QR5_K, QI5_K, true , block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3724
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3725
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3726
3693
#else
3727
- (void ) vec_dot_q5_K_q8_1_mul_mat;
3694
+ const int mmq_x = -1 ;
3695
+ const int mmq_y = -1 ;
3696
+ const int nwarps = -1 ;
3728
3697
assert (false );
3729
3698
#endif // __CUDA_ARCH__ >= CC_TURING
3699
+
3700
+ mul_mat_q<QK_K, QR5_K, QI5_K, true , block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3701
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3702
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3730
3703
}
3731
3704
3732
3705
#define MMQ_X_Q6_K_AMPERE 64
@@ -3749,23 +3722,20 @@ template <bool need_check> static __global__ void
3749
3722
const int mmq_x = MMQ_X_Q6_K_AMPERE;
3750
3723
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3751
3724
const int nwarps = NWARPS_Q6_K_AMPERE;
3752
-
3753
- mul_mat_q<QK_K, QR6_K, QI6_K, false , block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3754
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3755
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3756
-
3757
3725
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
3758
3726
const int mmq_x = MMQ_X_Q6_K_PASCAL;
3759
3727
const int mmq_y = MMQ_Y_Q6_K_PASCAL;
3760
3728
const int nwarps = NWARPS_Q6_K_PASCAL;
3761
-
3762
- mul_mat_q<QK_K, QR6_K, QI6_K, false , block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3763
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3764
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3765
3729
#else
3766
- (void ) vec_dot_q6_K_q8_1_mul_mat;
3730
+ const int mmq_x = -1 ;
3731
+ const int mmq_y = -1 ;
3732
+ const int nwarps = -1 ;
3767
3733
assert (false );
3768
3734
#endif // __CUDA_ARCH__ >= CC_TURING
3735
+
3736
+ mul_mat_q<QK_K, QR6_K, QI6_K, false , block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3737
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3738
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y);
3769
3739
}
3770
3740
3771
3741
template <int qk, int qi, typename block_q_t , int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
0 commit comments