 #include <map>
 #include <array>

+// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
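Note: `__CUDA_ARCH_LIST__` is defined by nvcc as a comma-separated list of the compiled-for architectures, so the macro has to be variadic (the commas would otherwise split the list into multiple arguments) and a two-step expansion is needed so the list is expanded before it is stringized. A minimal sketch of what the expansion yields, assuming a build that targets compute 6.1 and 7.0; the `arch_list_str` variable is only for illustration:

```cpp
// Assumption for this example: nvcc was invoked with
//   -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70
// so it defines __CUDA_ARCH_LIST__ as the token list 610,700.
#define STRINGIZE_IMPL(...) #__VA_ARGS__            // stringize whatever tokens we receive
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)  // expand the argument first, then stringize

// Expands to the string literal "610,700", which no_device_code() below
// prints as the list of architectures ggml-cuda.cu was compiled for.
static const char * arch_list_str = STRINGIZE(__CUDA_ARCH_LIST__);
```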
@@ -584,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

 [[noreturn]]
-static __device__ void bad_arch() {
-    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    (void) arch_list;
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     __trap();

-    (void) bad_arch; // suppress unused function warning
+    (void) no_device_code; // suppress unused function warning
 }

+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
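For reference, a minimal sketch of how `NO_DEVICE_CODE` is used in the kernels changed below: in device compilation it reports the offending kernel and architecture via `no_device_code()` and traps, while in host compilation it degrades to a `GGML_ASSERT`. The kernel name `dummy_kernel` is hypothetical; `CC_PASCAL` and `GGML_ASSERT` are the existing ggml-cuda.cu definitions:

```cpp
// Hypothetical kernel illustrating the guard pattern used throughout this diff.
static __global__ void dummy_kernel(float * dst) {
#if __CUDA_ARCH__ >= CC_PASCAL  // real body only for architectures that support it
    dst[threadIdx.x] = 0.0f;
#else
    (void) dst;                 // silence unused-parameter warnings on the fallback path
    NO_DEVICE_CODE;             // prints file/line/kernel/arch info, then traps
#endif // __CUDA_ARCH__ >= CC_PASCAL
}
```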
@@ -617,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
     return a;
 #else
     (void) a;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -638,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
     return x;
 #else
     (void) x;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

@@ -2421,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     }
 #else
     (void) vx; (void) y; (void) k;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -2452,7 +2471,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2489,7 +2508,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2524,7 +2543,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2569,7 +2588,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2590,7 +2609,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

     return d8_0*d8_1 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2620,7 +2639,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2655,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2692,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2732,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

     return d3 * sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2757,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

     return d3*d8 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2790,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2823,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2863,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2896,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2926,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

     return d*sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2957,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -3823,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4006,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4501,7 +4520,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4570,7 +4589,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4637,7 +4656,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4704,7 +4723,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4771,7 +4790,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4838,7 +4857,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4907,7 +4926,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4976,7 +4995,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5043,7 +5062,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5112,7 +5131,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5835,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
     }
 #else
     (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
