116
116
#include " ggml.h"
117
117
#include " ggml-backend-impl.h"
118
118
119
+ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
120
+
119
121
#define CC_PASCAL 600
120
122
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
121
123
#define CC_VOLTA 700
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
605
607
}
606
608
607
609
// Sums a half2 value across all 32 lanes of the current warp via XOR-butterfly
// shuffles; every lane returns the full warp-wide sum.
// Requires Pascal (CC_PASCAL, sm_60) or newer for half2 arithmetic (__hadd2);
// on older architectures or on HIP/AMD builds the half2 path is unsupported,
// so the function traps via bad_arch().
// NOTE(review): 0xffffffff assumes all 32 lanes participate — callers must
// invoke this with a full, converged warp.
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
    }
    return a;
#else
    (void) a;
    bad_arch();
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
619
621
620
622
static __device__ __forceinline__ float warp_reduce_max (float x) {
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
626
628
}
627
629
628
630
// Computes the element-wise maximum of a half2 value across all 32 lanes of
// the current warp via XOR-butterfly shuffles; every lane returns the result.
// Requires Pascal (CC_PASCAL, sm_60) or newer AND a CUDA runtime >= CUDART_HMAX
// (CUDA 11.7), the minimum version for which __hmax2 is known to work; on
// HIP/AMD builds or when either requirement is unmet the function traps via
// bad_arch().
// NOTE(review): 0xffffffff assumes all 32 lanes participate — callers must
// invoke this with a full, converged warp.
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
#else
    (void) x;
    bad_arch();
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
}
640
642
641
643
static __device__ __forceinline__ float op_repeat (const float a, const float b) {
@@ -5613,7 +5615,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
5613
5615
5614
5616
template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
5615
5617
static __global__ void soft_max_f16 (const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
5616
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
5618
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5617
5619
const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
5618
5620
const int ncols_smem = GGML_PAD (ncols_data, 2 *WARP_SIZE)/2 ;
5619
5621
@@ -5738,7 +5740,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
5738
5740
#else
5739
5741
(void ) x; (void ) y; (void ) dst; (void ) ncols_par; (void ) nrows_y; (void ) scale;
5740
5742
bad_arch ();
5741
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
5743
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5742
5744
}
5743
5745
5744
5746
template <bool vals_smem, int ncols_template, int block_size_template>
@@ -8574,15 +8576,15 @@ static void ggml_cuda_op_soft_max(
8574
8576
float scale = 1 .0f ;
8575
8577
memcpy (&scale, dst->op_params , sizeof (float ));
8576
8578
8577
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8578
- const bool use_f16_soft_max = false ;
8579
- #else
8579
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
8580
8580
#ifdef GGML_CUDA_F16
8581
8581
const bool use_f16_soft_max = true ;
8582
8582
#else
8583
8583
const bool use_f16_soft_max = false ;
8584
8584
#endif // GGML_CUDA_F16
8585
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8585
+ #else
8586
+ const bool use_f16_soft_max = false ;
8587
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
8586
8588
8587
8589
if (use_f16_soft_max) {
8588
8590
soft_max_f16_cuda (src0_dd, src1 ? src1_dd : nullptr , dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
0 commit comments