Commit 04eec58

ggml : remove q1_3 and q2_2
* llama : remove the separate scale tensors of BitNet b1.58
  They won't be needed, since the remaining ternary quant types have built-in scales.
1 parent: 45719a2
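The "built-in scales" mentioned above refers to the remaining ternary block formats (TQ1_0, TQ2_0) carrying a per-block fp16 scale inside every quantized block, which is why a separate .scale tensor per weight matrix is no longer needed. A minimal sketch of that layout, written from memory rather than copied from ggml-common.h (field names and sizes may differ in detail):

#include <stdint.h>

// Illustrative stand-ins only; ggml defines its own fp16 type and block structs.
typedef uint16_t ggml_half_sketch;

#define QK_K_SKETCH 256

typedef struct {
    uint8_t          qs[(QK_K_SKETCH - 4 * QK_K_SKETCH / 64) / 5]; // 5 ternary values per byte (3^5 = 243 < 256)
    uint8_t          qh[QK_K_SKETCH / 64];                         // 4 ternary values per byte
    ggml_half_sketch d;                                            // built-in per-block scale
} block_tq1_0_sketch;

typedef struct {
    uint8_t          qs[QK_K_SKETCH / 4]; // 2 bits per ternary value
    ggml_half_sketch d;                   // built-in per-block scale
} block_tq2_0_sketch;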

12 files changed: +45, -693 lines

convert_hf_to_gguf.py

Lines changed: 9 additions & 41 deletions
@@ -284,9 +284,6 @@ def prepare_tensors(self):
 
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray  # type hint
-                if len(data.shape) == 0:
-                    # otherwise single-value tensors get squeezed
-                    data = data.reshape((1,))
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None
@@ -317,33 +314,12 @@ def prepare_tensors(self):
                 ))
 
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    # TODO: cleaner model-specific per-tensor types
-                    # NOTE: Q1_3 is only relevant for BitNet b1.58
-                    if (
-                        self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
-                        and gguf.can_quantize_to_q1_3(data)
-                        and not any(
-                            self.match_model_tensor_name(new_name, key, None)
-                            for key in [
-                                gguf.MODEL_TENSOR.TOKEN_EMBD,
-                                gguf.MODEL_TENSOR.OUTPUT,
-                            ]
-                        )
-                    ):
-                        data = gguf.quantize_q1_3(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q1_3
-
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data = gguf.quantize_bf16(data)
                         assert data.dtype == np.int16
                         data_qtype = gguf.GGMLQuantizationType.BF16
 
-                    elif (
-                        self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0
-                        or self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
-                        and gguf.can_quantize_to_q8_0(data)
-                    ):
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
                         data = gguf.quantize_q8_0(data)
                         assert data.dtype == np.uint8
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
@@ -1635,12 +1611,6 @@ def prepare_tensors(self):
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, *args, **kwargs):
-        if ftype == gguf.LlamaFileType.GUESSED:
-            ftype = gguf.LlamaFileType.MOSTLY_Q1_3
-
-        super().__init__(dir_model, ftype, *args, **kwargs)
-
     def set_vocab(self):
         self._set_vocab_sentencepiece()
 
@@ -1649,16 +1619,16 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
         scale = weight.abs().mean().clamp(min=1e-5)
         iscale = 1 / scale
-        weight = (weight * iscale).round().clamp(-1, 1)
-        # TODO: use the scale directly instead of inverting it twice
+        # TODO: multiply by the scale directly instead of inverting it twice
         # (this is also unnecessarily doubly inverted upstream)
         # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
-        return weight.type(dtype), (1 / iscale).type(torch.float32)
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
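Net effect of the weight_quant change: it now returns a single fp32 tensor whose values are already scale * {-1, 0, +1}, instead of a (ternary weights, scale) pair. Worked example from the code above: for weights [0.3, -0.7, 0.05], scale = mean(|w|) = 0.35, so w * iscale ≈ [0.86, -2.0, 0.14], which rounds and clamps to [1, -1, 0]; dividing by iscale again gives the returned tensor [0.35, -0.35, 0.0].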
@@ -1673,11 +1643,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")

examples/quantize/quantize.cpp

Lines changed: 0 additions & 2 deletions
@@ -28,8 +28,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization", },
     { "TQ1_0",   LLAMA_FTYPE_MOSTLY_TQ1_0,   " 1.69 bpw ternarization", },
     { "TQ2_0",   LLAMA_FTYPE_MOSTLY_TQ2_0,   " 2.06 bpw ternarization", },
-    { "Q1_3",    LLAMA_FTYPE_MOSTLY_Q1_3,    " 1.63 bpw for BitNet b1.58", },
-    { "Q2_2",    LLAMA_FTYPE_MOSTLY_Q2_2,    " 2.00 bpw for BitNet b1.58", },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },

ggml/include/ggml.h

Lines changed: 0 additions & 2 deletions
@@ -392,8 +392,6 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0    = 34,
         GGML_TYPE_TQ2_0    = 35,
-        GGML_TYPE_Q2_2     = 36,
-        GGML_TYPE_Q1_3     = 37,
         GGML_TYPE_COUNT,
     };
 
ggml/src/ggml-common.h

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -141,20 +141,6 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
-// 1.625 bpw for BitNet b1.58 models
-#define QK1_3 64
-typedef struct {
-    uint8_t q[(QK1_3 - 4*QK1_3/64)/5]; // 5 elements per byte (3^5 = 243 < 256)
-    uint8_t qs[QK1_3/64]; // 4 elements per byte
-} block_q1_3;
-static_assert(sizeof(block_q1_3) == (QK1_3 - 4*QK1_3/64)/5 + QK1_3/64, "wrong q1_3 block size/padding");
-
-#define QK2_2 32
-typedef struct {
-    uint8_t qs[QK2_2 / 4]; // nibbles / quants
-} block_q2_2;
-static_assert(sizeof(block_q2_2) == QK2_2 / 4, "wrong q2_2 block size/padding");
-
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
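The deleted block_q1_3 packed most of its 64 ternary weights five to a byte as base-3 digits, which is what the "5 elements per byte (3^5 = 243 < 256)" comment refers to; the remaining 4 weights went into the single qs byte. A self-contained sketch of that packing idea (not the actual ggml pack/unpack kernels, whose digit order and fixed-point tricks may differ):

#include <stdint.h>

// Pack five ternary values {-1, 0, +1} into one byte as a base-3 number.
// 3^5 = 243 fits in a byte, which is why 5 values per byte works.
static uint8_t pack5_ternary(const int8_t t[5]) {
    uint8_t b = 0;
    for (int i = 4; i >= 0; --i) {
        b = (uint8_t)(b * 3 + (uint8_t)(t[i] + 1)); // map {-1,0,+1} -> {0,1,2}
    }
    return b; // 0..242
}

// Inverse: recover the five ternary values from one packed byte.
static void unpack5_ternary(uint8_t b, int8_t t[5]) {
    for (int i = 0; i < 5; ++i) {
        t[i] = (int8_t)(b % 3) - 1;
        b /= 3;
    }
}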
@@ -1084,41 +1070,6 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()
 
-GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256)
-    0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff,
-    0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001,
-    0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff,
-    0xff010000, 0xff010001, 0xff0101ff, 0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01,
-    0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00,
-    0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101,
-    0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000, 0x00010001, 0x000101ff, 0x00010100,
-    0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001,
-    0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000,
-    0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x0101ff01,
-    0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00,
-    0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff,
-    0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100,
-    0xff000101, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff,
-    0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0000,
-    0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff,
-    0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01,
-    0x000100ff, 0x00010000, 0x00010000, 0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff,
-    0x01ffff00, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101,
-    0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x01000001, 0x010001ff,
-    0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001,
-    0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000,
-    0xffff0001, 0xffff01ff, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01,
-    0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ff00,
-    0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff, 0xff0101ff, 0xff010100, 0xff010101,
-    0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100,
-    0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff,
-    0x00000100, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000,
-    0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ff00ff,
-    0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x01ff0101, 0x0100ffff, 0x0100ff00,
-    0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff,
-    0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101,
-GGML_TABLE_END()
-
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
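The deleted q1_3_grid mapped each possible byte value to four signed weights packed into a uint32_t, one byte per weight, with 0xff meaning -1, 0x00 meaning 0 and 0x01 meaning +1. A hedged sketch of decoding one entry (assuming little-endian element order; the real kernels consume the table differently, e.g. with SIMD loads):

#include <stdint.h>
#include <string.h>

// Expand one grid entry (four bytes, each 0xff / 0x00 / 0x01) into four
// signed ternary weights by reinterpreting the bytes as int8_t.
static void decode_q1_3_grid_entry(uint32_t entry, int8_t w[4]) {
    memcpy(w, &entry, 4); // 0xff becomes -1, 0x00 stays 0, 0x01 becomes +1
}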
