Skip to content

Commit 0ee6334

Browse files
ggerganov authored and teleprint-me committed
ggml : drop support for QK_K=64 (ggml-org#7473)
* ggml : drop support for QK_K=64

ggml-ci

* opencl : restore QK_K=256 define
1 parent f606b75 commit 0ee6334

File tree

16 files changed

+1741
-5764
lines changed

16 files changed

+1741
-5764
lines changed

CMakeLists.txt

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
124124
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
125125
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
126126
option(LLAMA_RPC "llama: use RPC" OFF)
127-
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
128127
option(LLAMA_SYCL "llama: use SYCL" OFF)
129128
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
130129
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@@ -384,10 +383,6 @@ if (LLAMA_LLAMAFILE)
384383
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
385384
endif()
386385

387-
if (LLAMA_QKK_64)
388-
add_compile_definitions(GGML_QKK_64)
389-
endif()
390-
391386
if (LLAMA_CUBLAS)
392387
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
393388
set(LLAMA_CUDA ON)

Makefile

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -389,10 +389,6 @@ else
389389
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
390390
endif
391391

392-
ifdef LLAMA_QKK_64
393-
MK_CPPFLAGS += -DGGML_QKK_64
394-
endif
395-
396392
ifndef LLAMA_NO_ACCELERATE
397393
# Mac OS - include Accelerate framework.
398394
# `-framework Accelerate` works both with Apple Silicon and Mac Intel

ci/run.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -606,7 +606,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
606606

607607
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
608608
if [ -z ${GG_BUILD_CUDA} ]; then
609-
test $ret -eq 0 && gg_run open_llama_3b_v2
609+
#test $ret -eq 0 && gg_run open_llama_3b_v2
610+
date # dummy
610611
else
611612
test $ret -eq 0 && gg_run open_llama_7b_v2
612613
fi

ggml-common.h

Lines changed: 0 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
6565
// QK = number of values after dequantization
6666
// QK_K = super-block size
6767

68-
#ifdef GGML_QKK_64
69-
#define QK_K 64
70-
#define K_SCALE_SIZE 4
71-
#else
7268
#define QK_K 256
7369
#define K_SCALE_SIZE 12
74-
#endif // GGML_QKK_64
7570

7671
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
7772
// QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
131126
#define QI4_NL (QK4_NL / (4*QR4_NL))
132127
#define QR4_NL 2
133128

134-
#if QK_K == 64
135-
#define QI4_XS QI4_NL
136-
#define QR4_XS QR4_NL
137-
#else
138129
#define QI4_XS (QK_K / (4*QR4_XS))
139130
#define QR4_XS 8
140-
#endif
141131

142132
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
143133

@@ -228,36 +218,18 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
228218
// weight is represented as x = a * q
229219
// 16 blocks of 16 elements each
230220
// Effectively 3.4375 bits per weight
231-
#ifdef GGML_QKK_64
232-
typedef struct {
233-
uint8_t hmask[QK_K/8]; // quants - high bit
234-
uint8_t qs[QK_K/4]; // quants - low 2 bits
235-
uint8_t scales[2];
236-
ggml_half d; // super-block scale
237-
} block_q3_K;
238-
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
239-
#else
240221
typedef struct {
241222
uint8_t hmask[QK_K/8]; // quants - high bit
242223
uint8_t qs[QK_K/4]; // quants - low 2 bits
243224
uint8_t scales[12]; // scales, quantized with 6 bits
244225
ggml_half d; // super-block scale
245226
} block_q3_K;
246227
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
247-
#endif
248228

249229
// 4-bit quantization
250230
// 8 blocks of 32 elements each
251231
// weight is represented as x = a * q + b
252232
// Effectively 4.5 bits per weight
253-
#ifdef GGML_QKK_64
254-
typedef struct {
255-
ggml_half d[2]; // super-block scales/mins
256-
uint8_t scales[2]; // 4-bit block scales/mins
257-
uint8_t qs[QK_K/2]; // 4--bit quants
258-
} block_q4_K;
259-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
260-
#else
261233
typedef struct {
262234
union {
263235
struct {
@@ -270,21 +242,11 @@ typedef struct {
270242
uint8_t qs[QK_K/2]; // 4--bit quants
271243
} block_q4_K;
272244
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
273-
#endif
274245

275246
// 5-bit quantization
276247
// 8 blocks of 32 elements each
277248
// weight is represented as x = a * q + b
278249
// Effectively 5.5 bits per weight
279-
#ifdef GGML_QKK_64
280-
typedef struct {
281-
ggml_half d; // super-block scale
282-
int8_t scales[QK_K/16]; // 8-bit block scales
283-
uint8_t qh[QK_K/8]; // quants, high bit
284-
uint8_t qs[QK_K/2]; // quants, low 4 bits
285-
} block_q5_K;
286-
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
287-
#else
288250
typedef struct {
289251
union {
290252
struct {
@@ -298,7 +260,6 @@ typedef struct {
298260
uint8_t qs[QK_K/2]; // quants, low 4 bits
299261
} block_q5_K;
300262
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
301-
#endif
302263

303264
// 6-bit quantization
304265
// weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
356317
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
357318

358319
// 3.4375 bpw
359-
#if QK_K == 64
360-
#define IQ3S_N_SCALE 2
361-
#else
362320
#define IQ3S_N_SCALE QK_K/64
363-
#endif
364321
typedef struct {
365322
ggml_half d;
366323
uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
381338
typedef struct {
382339
uint8_t qs[QK_K/8]; // grid index, low 8 bits
383340
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
384-
#if QK_K == 64
385-
ggml_half d;
386-
#endif
387341
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
388342
} block_iq1_m;
389-
#if QK_K == 64
390-
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
391-
#else
392343
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
393-
#endif
394344

395345
// Used by IQ1_M quants
396346
typedef union {
@@ -406,17 +356,13 @@ typedef struct {
406356
} block_iq4_nl;
407357
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
408358

409-
#if QK_K == 64
410-
#define block_iq4_xs block_iq4_nl
411-
#else
412359
typedef struct {
413360
ggml_half d;
414361
uint16_t scales_h;
415362
uint8_t scales_l[QK_K/64];
416363
uint8_t qs[QK_K/2];
417364
} block_iq4_xs;
418365
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
419-
#endif
420366

421367
#endif // GGML_COMMON_DECL
422368
#endif // GGML_COMMON_DECL

0 commit comments

Comments
 (0)