
Commit 151ff95

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   CMakeLists.txt
#   Makefile
#   README.md
#   ggml-cuda.cu
#   ggml-cuda/common.cuh
2 parents 12dfb92 + f702a90 commit 151ff95

8 files changed: +109 −116 lines changed

common/common.cpp

Lines changed: 4 additions & 2 deletions
@@ -1539,9 +1539,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
     options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
     options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
-    options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+    options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
+                                                        "note: this argument can be repeated to add multiple control vectors" });
     options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
-                        "add a control vector with user defined scaling SCALE" });
+                        "add a control vector with user defined scaling SCALE\n"
+                        "note: this argument can be repeated to add multiple scaled control vectors" });
     options.push_back({ "*", " --control-vector-layer-range START END",
                        "layer range to apply the control vector(s) to, start and end inclusive" });
     options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"

ggml-cuda.cu

Lines changed: 33 additions & 63 deletions
@@ -154,16 +154,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

     int64_t total_vram = 0;
-// #if defined(GGML_CUDA_FORCE_MMQ)
-//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+// #ifdef GGML_CUDA_FORCE_MMQ
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 // #else
-//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-// #endif
-// #if defined(CUDA_USE_TENSOR_CORES)
-//     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+// #endif // GGML_CUDA_FORCE_MMQ
+// #ifdef GGML_CUDA_FORCE_CUBLAS
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
 // #else
-//     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-// #endif
+//     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+// #endif // GGML_CUDA_FORCE_CUBLAS
     GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
@@ -1873,9 +1873,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

-    int64_t min_compute_capability = INT_MAX;
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    bool any_gpus_with_slow_fp16 = false;

-    bool any_pascal_with_slow_fp16 = false;
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
@@ -1885,62 +1893,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }

-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (ggml_cuda_info().devices[id].cc == 610) {
-                any_pascal_with_slow_fp16 = true;
-            }
+            const int cc = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
         }
     } else {
-        min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
-        any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
-    }
-
-    // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
-
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-
-    bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-
-    if(!g_mul_mat_q)
-    {
-        use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
-    }
-
-#else
-
-    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
-
-    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
-    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
-    use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
-
-    if(!g_mul_mat_q)
-    {
-        use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
+        const int cc = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
     }

-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
     // if mmvq is available it's a better choice than dmmv:
 #ifndef GGML_CUDA_FORCE_DMMV
     use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
 #endif // GGML_CUDA_FORCE_DMMV

-    const bool use_tensor_cores = fp16_performance_good && !g_mul_mat_q;
-
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -1949,14 +1918,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
+    if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQ single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
-    } else if (!split && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
+    } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQV single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || use_tensor_cores) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
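
Note on the hunk above: the per-device compute-capability bookkeeping (min_compute_capability, any_pascal_with_slow_fp16) is folded directly into the use_* flags, and the cuBLAS fallback chooses FP16 or FP32 based on any_gpus_with_slow_fp16. The following is a standalone host-side sketch of that selection order for a quantized src0 on a single NVIDIA GPU, not ggml code: pick_kernel() and should_use_mmq() are made-up stand-ins, the constants are assumed to mirror common.cuh/mmvq.cuh, and the DMMV divisibility check is omitted.

// Standalone sketch of the kernel-selection order, for illustration only.
#include <cstdint>
#include <cstdio>

static constexpr int     CC_PASCAL           = 600;
static constexpr int     MIN_CC_DP4A         = 610;  // __dp4a needs CC >= 6.1
static constexpr int64_t MMVQ_MAX_BATCH_SIZE = 8;

static constexpr bool fast_fp16_available(int cc) { return cc >= CC_PASCAL && cc != 610; }
// Simplified stand-in for ggml_cuda_should_use_mmq() with the MMQ toggle enabled:
static bool should_use_mmq(int cc, int64_t ne11) { (void) ne11; return cc >= MIN_CC_DP4A; }

static const char * pick_kernel(int cc, int64_t ne11) {
    bool use_mul_mat_vec_q = ne11 <= MMVQ_MAX_BATCH_SIZE && cc >= MIN_CC_DP4A;
    bool use_mul_mat_q     = should_use_mmq(cc, ne11);
    // if mmvq is available it's a better choice than dmmv:
    bool use_dmmv          = ne11 == 1 && !use_mul_mat_vec_q;

    if (use_dmmv)          return "dequantize_mul_mat_vec";
    if (use_mul_mat_vec_q) return "mul_mat_vec_q";
    if (use_mul_mat_q)     return "mul_mat_q";
    return fast_fp16_available(cc) ? "cuBLAS (FP16)" : "cuBLAS (FP32)";
}

int main() {
    printf("cc 520, batch 1:   %s\n", pick_kernel(520, 1));   // pre-dp4a Maxwell -> dmmv
    printf("cc 610, batch 1:   %s\n", pick_kernel(610, 1));   // consumer Pascal -> mmvq
    printf("cc 750, batch 512: %s\n", pick_kernel(750, 512)); // Turing, large batch -> mmq
    return 0;
}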

ggml-cuda/common.cuh

Lines changed: 3 additions & 33 deletions
@@ -146,23 +146,6 @@
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)

-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
-
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #if defined(_MSC_VER)
@@ -343,15 +326,15 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
 #define INT8_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING

-static bool fast_fp16_available(const int cc) {
+static constexpr bool fast_fp16_available(const int cc) {
     return cc >= CC_PASCAL && cc != 610;
 }

-static bool fp16_mma_available(const int cc) {
+static constexpr bool fp16_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
 }

-static bool int8_mma_available(const int cc) {
+static constexpr bool int8_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_TURING;
 }

@@ -643,19 +626,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
     static constexpr int qi = QI3_S;
 };

-static constexpr int get_mmq_x_max_host(int cc) {
-#ifdef CUDA_USE_TENSOR_CORES
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
-#else
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
-#endif // CUDA_USE_TENSOR_CORES
-}
-
-// Round rows to this value for --split-mode row:
-static constexpr int get_mmq_y_host(int cc) {
-    return cc >= CC_VOLTA ? 128 : 64;
-}
-
 //////////////////////

 struct ggml_cuda_device_info {
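
Besides removing the old CUDA_USE_TENSOR_CORES machinery, the change above makes the three capability predicates constexpr, so host code can evaluate them at compile time. A minimal illustration, not part of the tree; the CC_* values below are assumptions mirroring the #defines in this header.

// Minimal sketch: constexpr predicates usable in constant expressions.
static constexpr int CC_PASCAL     = 600;
static constexpr int CC_VOLTA      = 700;
static constexpr int CC_TURING     = 750;
static constexpr int CC_OFFSET_AMD = 1000000;

static constexpr bool fast_fp16_available(const int cc) { return cc >= CC_PASCAL && cc != 610; }
static constexpr bool fp16_mma_available(const int cc)  { return cc < CC_OFFSET_AMD && cc >= CC_VOLTA; }
static constexpr bool int8_mma_available(const int cc)  { return cc < CC_OFFSET_AMD && cc >= CC_TURING; }

// Because they are constexpr they can feed static_asserts and other compile-time host logic:
static_assert( fast_fp16_available(600), "P100 (cc 6.0) counts as fast FP16");
static_assert(!fast_fp16_available(610), "consumer Pascal (cc 6.1) does not");
static_assert( fp16_mma_available(700) && !int8_mma_available(700), "Volta: FP16 MMA, no int8 MMA");
static_assert( int8_mma_available(750), "Turing and newer have int8 MMA");

This is presumably what allows mmq.cuh below to move get_mmq_x_max_host()/get_mmq_y_host() next to their device variants while still calling int8_mma_available() in a constant expression.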

ggml-cuda/mmq.cu

Lines changed: 36 additions & 3 deletions
@@ -69,7 +69,15 @@ void ggml_cuda_op_mul_mat_q(
     GGML_UNUSED(src1_ddf_i);
 }

-bool ggml_cuda_supports_mmq(enum ggml_type type) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+
+    if(!g_mul_mat_q)
+    {
+        return false;
+    }
+
+    bool mmq_supported;
+
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -81,8 +89,33 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
-            return true;
+            mmq_supported = true;
+            break;
         default:
-            return false;
+            mmq_supported = false;
+            break;
+    }
+
+    if (!mmq_supported) {
+        return false;
     }
+
+    if (int8_mma_available(cc)) {
+        return true;
+    }
+
+    if (cc < MIN_CC_DP4A) {
+        return false;
+    }
+
+    if(g_mul_mat_q)
+    {
+        return true;
+    }
+
+    if (cc < CC_OFFSET_AMD) {
+        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    }
+
+    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
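
The renamed helper now decides per device and per batch size rather than per type only. A hedged standalone paraphrase of its control flow follows; the function and parameter names are made up for illustration, and the constants are assumptions mirroring common.cuh and mmq.cuh.

// Paraphrase of the decision flow above as a pure constexpr function, illustration only.
#include <cstdint>

static constexpr int     MIN_CC_DP4A             = 610;
static constexpr int     CC_VOLTA                = 700;
static constexpr int     CC_TURING               = 750;
static constexpr int     CC_OFFSET_AMD           = 1000000;
static constexpr int     CC_RDNA3                = CC_OFFSET_AMD + 1100;
static constexpr int64_t MMQ_DP4A_MAX_BATCH_SIZE = 64;

static constexpr bool int8_mma_available(int cc) { return cc < CC_OFFSET_AMD && cc >= CC_TURING; }

static constexpr bool should_use_mmq(bool mul_mat_q_toggle, bool type_has_mmq_kernel, int cc, int64_t ne11) {
    if (!mul_mat_q_toggle)      return false;  // g_mul_mat_q gate (a toggle in this fork): off -> never MMQ
    if (!type_has_mmq_kernel)   return false;  // only the quantization types listed in the switch
    if (int8_mma_available(cc)) return true;   // Turing+: int8 tensor core path, always use MMQ
    if (cc < MIN_CC_DP4A)       return false;  // no __dp4a -> no MMQ at all
    if (mul_mat_q_toggle)       return true;   // toggle on -> prefer MMQ over cuBLAS on dp4a hardware
    // upstream batch-size heuristic, kept from the merge (not reached while both toggle gates above exist):
    if (cc < CC_OFFSET_AMD)     return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

// With the toggle on, any dp4a-capable NVIDIA GPU picks MMQ regardless of batch size:
static_assert( should_use_mmq(true,  true, 610, 512), "dp4a Pascal, large batch");
static_assert(!should_use_mmq(false, true, 800, 1),   "toggle off disables MMQ everywhere");
static_assert(!should_use_mmq(true,  true, 520, 1),   "Maxwell (cc 5.2): no __dp4a");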

ggml-cuda/mmq.cuh

Lines changed: 30 additions & 13 deletions
@@ -7,6 +7,8 @@
 #include <climits>
 #include <cstdint>

+#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
+
 typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride);
 typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k0);
 typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max);
@@ -24,25 +26,42 @@ struct tile_x_sizes {
     int sc;
 };

-// get_mmq_x_max_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
+static constexpr int get_mmq_x_max_host(const int cc) {
+    return int8_mma_available(cc) ? 128 :
+#ifdef GGML_CUDA_FORCE_MMQ
+        cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
+#else
+        cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
+#endif // GGML_CUDA_FORCE_MMQ
+}

 static constexpr __device__ int get_mmq_x_max_device() {
+#ifdef INT8_MMA_AVAILABLE
+    return 128;
+#else // INT8_MMA_AVAILABLE
+
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    return 64;
-#else
+    return 128;
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
 #if __CUDA_ARCH__ >= CC_VOLTA
-#ifdef CUDA_USE_TENSOR_CORES
-    return MMQ_MAX_BATCH_SIZE;
-#else
+#ifdef GGML_CUDA_FORCE_MMQ
+    return MMQ_DP4A_MAX_BATCH_SIZE;
+#else // GGML_CUDA_FORCE_MMQ
     return 128;
-#endif // CUDA_USE_TENSOR_CORES
-#else
+#endif // GGML_CUDA_FORCE_MMQ
+#else // __CUDA_ARCH__ >= CC_VOLTA
+
     return 64;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
+
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // INT8_MMA_AVAILABLE
 }

-// get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
+static constexpr int get_mmq_y_host(const int cc) {
+    return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
+}

 static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -2035,15 +2054,13 @@ static __device__ __forceinline__ void mmq_write_back_mma(
     static_assert(nwarps*mma_C::I == mmq_y, "nwarps*mma_C::I != mmq_y");
 #endif // INT8_MMA_AVAILABLE

-    dst += (threadIdx.y % ntx) * mma_C::J*stride;
-
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
 #pragma unroll
         for (int n = 0; n < ntx; ++n) {
 #pragma unroll
             for (int l = 0; l < mma_C::ne; ++l) {
-                const int j = j0 + mma_C::get_j(l);
+                const int j = j0 + (threadIdx.y % ntx) * mma_C::J + mma_C::get_j(l);

                 if (j > j_max) {
                     continue;
@@ -2590,4 +2607,4 @@ void ggml_cuda_op_mul_mat_q(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);

-bool ggml_cuda_supports_mmq(enum ggml_type type);
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11);
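
The host-side tile-size helpers moved here from common.cuh now key off int8_mma_available() and GGML_CUDA_FORCE_MMQ instead of CUDA_USE_TENSOR_CORES. The compile-time sketch below shows what they resolve to for a few representative compute capabilities, assuming GGML_CUDA_FORCE_MMQ is not defined; the constants and locally redeclared helpers are assumptions mirroring common.cuh, not code from the tree.

// Compile-time sketch of the relocated host helpers, illustration only.
static constexpr int CC_VOLTA                = 700;
static constexpr int CC_TURING               = 750;
static constexpr int CC_OFFSET_AMD           = 1000000;
static constexpr int MMQ_DP4A_MAX_BATCH_SIZE = 64;

static constexpr bool int8_mma_available(const int cc) { return cc < CC_OFFSET_AMD && cc >= CC_TURING; }

static constexpr int mmq_x_max_host(const int cc) {
    return int8_mma_available(cc) ? 128
         : cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
}
static constexpr int mmq_y_host(const int cc) {
    return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
}

static_assert(mmq_x_max_host(610) ==  64 && mmq_y_host(610) ==  64, "dp4a-only Pascal");
static_assert(mmq_x_max_host(700) ==  64 && mmq_y_host(700) == 128, "Volta: dp4a MMQ, rows rounded to 128");
static_assert(mmq_x_max_host(800) == 128 && mmq_y_host(800) == 128, "Ampere: int8 MMA tiles");

The y value presumably still serves the purpose noted in the removed common.cuh comment, i.e. the row-rounding granularity for --split-mode row.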

ggml-cuda/mmvq.cuh

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 #include "common.cuh"

+#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
+
 void ggml_cuda_op_mul_mat_vec_q(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,

ggml-sycl.cpp

Lines changed: 1 addition & 1 deletion
@@ -4620,7 +4620,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {

koboldcpp.py

Lines changed: 0 additions & 1 deletion
@@ -2079,7 +2079,6 @@ def autoset_gpu_layers(filepath): #shitty algo to determine how many layers to u
         gui_layers_zeroed = gpulayers_var.get()=="" or gpulayers_var.get()=="0"
         if (gui_layers_untouched or gui_layers_zeroed) and layerlimit>0:
             gpulayers_var.set(str(layerlimit))
-            mmq_var.set(0 if layerlimit>=200 else 1)
             gui_layers_untouched = old_gui_layers_untouched
             if gui_layers_zeroed:
                 gui_layers_untouched = True
