
Commit 76d66ee

CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (#7921)
* CUDA: faster q2_K, q3_K MMQ + int8 tensor cores
* try CI fix
* try CI fix
* try CI fix
* fix data race
* revert q2_K precision related changes
1 parent 66ef1ce commit 76d66ee

File tree

6 files changed: +457 -319 lines changed

ggml-cuda.cu

Lines changed: 4 additions & 2 deletions

@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+        info.devices[id].nsm  = prop.multiProcessorCount;
+        info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        info.devices[id].smpbo = prop.sharedMemPerBlock;
         info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        info.devices[id].smpb = prop.sharedMemPerBlock;
-        info.devices[id].nsm  = prop.multiProcessorCount;
     }
 
     for (int id = 0; id < info.device_count; ++id) {
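
Note on the new smpbo field: on NVIDIA GPUs, cudaDeviceProp::sharedMemPerBlockOptin is the larger per-block shared memory limit a kernel may use only after an explicit per-kernel opt-in, while sharedMemPerBlock is the default cap (typically 48 KiB); on the HIP/AMD path no such opt-in exists, so the diff assigns the same value to both fields. A minimal standalone sketch of the two limits and the opt-in call; my_kernel is a hypothetical kernel, not code from this commit:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical kernel that wants a large dynamic shared memory buffer.
__global__ void my_kernel(float * dst) {
    extern __shared__ float buf[];
    buf[threadIdx.x] = (float) threadIdx.x;
    __syncthreads();
    dst[threadIdx.x] = buf[threadIdx.x];
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // smpb  <- sharedMemPerBlock:      usable without opt-in (48 KiB on most GPUs)
    // smpbo <- sharedMemPerBlockOptin: the cap after the per-kernel opt-in below
    printf("default: %zu bytes, opt-in: %zu bytes\n",
           prop.sharedMemPerBlock, prop.sharedMemPerBlockOptin);

    // The opt-in itself: raise this kernel's dynamic shared memory limit.
    cudaFuncSetAttribute(my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                         (int) prop.sharedMemPerBlockOptin);
    return 0;
}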

ggml-cuda/argsort.cu

Lines changed: 1 addition & 0 deletions

@@ -73,6 +73,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     const dim3 block_nums(1, nrows, 1);
     const size_t shared_mem = ncols_pad * sizeof(int);
 
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
     GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
 
     if (order == GGML_SORT_ORDER_ASC) {
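
The FIXME points at the same opt-in mechanism: the assert compares against smpb, the default 48 KiB limit, while on Ampere or newer the opt-in limit (smpbo) is roughly 2-4x higher, consistent with the comment. A sketch of how the launch could be adapted; sort_rows and launch_sort are illustrative stand-ins, not ggml's actual argsort implementation:

#include <cuda_runtime.h>

// Illustrative only: sort_rows stands in for ggml's argsort kernel.
__global__ void sort_rows(const float * x, int * dst, int ncols, int ncols_pad) {
    extern __shared__ int shm[];
    // ... bitonic argsort of one row via shm[0..ncols_pad), as in argsort.cu ...
}

static void launch_sort(const float * x, int * dst, int ncols, int ncols_pad,
                        int nrows, cudaStream_t stream) {
    const size_t shared_mem = ncols_pad * sizeof(int);

    // One-time opt-in per kernel; afterwards launches may use up to
    // sharedMemPerBlockOptin (smpbo) instead of the default 48 KiB (smpb).
    cudaFuncSetAttribute(sort_rows, cudaFuncAttributeMaxDynamicSharedMemorySize,
                         (int) shared_mem);

    sort_rows<<<dim3(1, nrows, 1), dim3(ncols_pad, 1, 1), shared_mem, stream>>>(
        x, dst, ncols, ncols_pad);
}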

ggml-cuda/common.cuh

Lines changed: 5 additions & 0 deletions

@@ -331,6 +331,10 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
 #define FP16_AVAILABLE
 #endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 
+#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+#define FAST_FP16_AVAILABLE
+#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
 #define FP16_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
@@ -661,6 +665,7 @@ struct ggml_cuda_device_info {
     int     cc;                 // compute capability
     int     nsm;                // number of streaming multiprocessors
     size_t  smpb;               // max. shared memory per block
+    size_t  smpbo;              // max. shared memory per block (with opt-in)
     bool    vmm;                // virtual memory support
     size_t  vmm_granularity;    // granularity of virtual memory
     size_t  total_vram;
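
FAST_FP16_AVAILABLE separates GPUs where FP16 arithmetic merely works from those where it is actually fast: compute capability 6.1 (GTX 10xx-class Pascal) runs half-precision math at a small fraction of its FP32 rate, so kernels there should keep FP16 storage but compute in FP32. An illustrative device function (not from this commit) showing the intended usage pattern:

#include <cuda_fp16.h>

// Dot product of two half2 values. With FAST_FP16_AVAILABLE the math stays in
// half2; on CC 6.1 the fallback converts to float first and computes in FP32.
static __device__ __forceinline__ float dot2(const half2 a, const half2 b) {
#ifdef FAST_FP16_AVAILABLE
    const half2 p = __hmul2(a, b);                 // full-rate FP16 multiply
    return __low2float(p) + __high2float(p);
#else
    const float2 af = __half22float2(a);           // FP16 storage, FP32 math
    const float2 bf = __half22float2(b);
    return af.x*bf.x + af.y*bf.y;
#endif // FAST_FP16_AVAILABLE
}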
