We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 6b12c2e, commit 9fb82af. Copy full SHA for 9fb82af
ggml/src/ggml-cuda/mmvq.cu
@@ -72,10 +72,13 @@ static __device__ void mul_mat_vec_q(
72
73
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
74
75
+ //int64_t rows_per_cuda_block = ggml_cuda_info().devices[id].cc < CC_RDNA2 ?
76
+ // ncols_y < 4 ? 1 : 2 : 1;
77
+
78
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
79
constexpr int rows_per_cuda_block = 1;
80
#else
- constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
81
+ constexpr int rows_per_cuda_block = ncols_y < 4 ? 1 : 2;
82
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
83
84
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
0 commit comments