Commit 04eec58

ggml : remove q1_3 and q2_2
* llama : remove the separate scale tensors of BitNet b1.58
  They won't be needed, since the remaining ternary quant types have built-in scales.
1 parent: 45719a2
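The "built-in scales" mentioned above refers to the remaining ternary block formats (TQ1_0, TQ2_0) carrying a per-block fp16 scale inside every quantized block, which is why a separate .scale tensor per weight matrix is no longer needed. A minimal sketch of that layout, written from memory rather than copied from ggml-common.h (field names and sizes may differ in detail):

#include <stdint.h>

// Illustrative stand-ins only; ggml defines its own fp16 type and block structs.
typedef uint16_t ggml_half_sketch;

#define QK_K_SKETCH 256

typedef struct {
    uint8_t          qs[(QK_K_SKETCH - 4 * QK_K_SKETCH / 64) / 5]; // 5 ternary values per byte (3^5 = 243 < 256)
    uint8_t          qh[QK_K_SKETCH / 64];                         // 4 ternary values per byte
    ggml_half_sketch d;                                            // built-in per-block scale
} block_tq1_0_sketch;

typedef struct {
    uint8_t          qs[QK_K_SKETCH / 4]; // 2 bits per ternary value
    ggml_half_sketch d;                   // built-in per-block scale
} block_tq2_0_sketch;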

12 files changed: +45, -693 lines

convert_hf_to_gguf.py

Lines changed: 9 additions & 41 deletions
@@ -284,9 +284,6 @@ def prepare_tensors(self):
 
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray  # type hint
-                if len(data.shape) == 0:
-                    # otherwise single-value tensors get squeezed
-                    data = data.reshape((1,))
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None
@@ -317,33 +314,12 @@ def prepare_tensors(self):
                 ))
 
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    # TODO: cleaner model-specific per-tensor types
-                    # NOTE: Q1_3 is only relevant for BitNet b1.58
-                    if (
-                        self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
-                        and gguf.can_quantize_to_q1_3(data)
-                        and not any(
-                            self.match_model_tensor_name(new_name, key, None)
-                            for key in [
-                                gguf.MODEL_TENSOR.TOKEN_EMBD,
-                                gguf.MODEL_TENSOR.OUTPUT,
-                            ]
-                        )
-                    ):
-                        data = gguf.quantize_q1_3(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q1_3
-
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data = gguf.quantize_bf16(data)
                         assert data.dtype == np.int16
                         data_qtype = gguf.GGMLQuantizationType.BF16
 
-                    elif (
-                        self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0
-                        or self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
-                        and gguf.can_quantize_to_q8_0(data)
-                    ):
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
                         data = gguf.quantize_q8_0(data)
                         assert data.dtype == np.uint8
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
@@ -1635,12 +1611,6 @@ def prepare_tensors(self):
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, *args, **kwargs):
-        if ftype == gguf.LlamaFileType.GUESSED:
-            ftype = gguf.LlamaFileType.MOSTLY_Q1_3
-
-        super().__init__(dir_model, ftype, *args, **kwargs)
-
     def set_vocab(self):
         self._set_vocab_sentencepiece()
 
@@ -1649,16 +1619,16 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
         scale = weight.abs().mean().clamp(min=1e-5)
         iscale = 1 / scale
-        weight = (weight * iscale).round().clamp(-1, 1)
-        # TODO: use the scale directly instead of inverting it twice
+        # TODO: multiply by the scale directly instead of inverting it twice
         # (this is also unnecessarily doubly inverted upstream)
         # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
-        return weight.type(dtype), (1 / iscale).type(torch.float32)
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
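Net effect of the weight_quant change: it now returns a single fp32 tensor whose values are already scale * {-1, 0, +1}, instead of a (ternary weights, scale) pair. Worked example from the code above: for weights [0.3, -0.7, 0.05], scale = mean(|w|) = 0.35, so w * iscale ≈ [0.86, -2.0, 0.14], which rounds and clamps to [1, -1, 0]; dividing by iscale again gives the returned tensor [0.35, -0.35, 0.0].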
@@ -1673,11 +1643,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")

examples/quantize/quantize.cpp

Lines changed: 0 additions & 2 deletions
@@ -28,8 +28,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization", },
     { "TQ1_0",   LLAMA_FTYPE_MOSTLY_TQ1_0,   " 1.69 bpw ternarization", },
     { "TQ2_0",   LLAMA_FTYPE_MOSTLY_TQ2_0,   " 2.06 bpw ternarization", },
-    { "Q1_3",    LLAMA_FTYPE_MOSTLY_Q1_3,    " 1.63 bpw for BitNet b1.58", },
-    { "Q2_2",    LLAMA_FTYPE_MOSTLY_Q2_2,    " 2.00 bpw for BitNet b1.58", },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },

ggml/include/ggml.h

Lines changed: 0 additions & 2 deletions
@@ -392,8 +392,6 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0    = 34,
         GGML_TYPE_TQ2_0    = 35,
-        GGML_TYPE_Q2_2     = 36,
-        GGML_TYPE_Q1_3     = 37,
         GGML_TYPE_COUNT,
     };
 
ggml/src/ggml-common.h

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -141,20 +141,6 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
-// 1.625 bpw for BitNet b1.58 models
-#define QK1_3 64
-typedef struct {
-    uint8_t q[(QK1_3 - 4*QK1_3/64)/5]; // 5 elements per byte (3^5 = 243 < 256)
-    uint8_t qs[QK1_3/64]; // 4 elements per byte
-} block_q1_3;
-static_assert(sizeof(block_q1_3) == (QK1_3 - 4*QK1_3/64)/5 + QK1_3/64, "wrong q1_3 block size/padding");
-
-#define QK2_2 32
-typedef struct {
-    uint8_t qs[QK2_2 / 4]; // nibbles / quants
-} block_q2_2;
-static_assert(sizeof(block_q2_2) == QK2_2 / 4, "wrong q2_2 block size/padding");
-
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
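The deleted block_q1_3 packed most of its 64 ternary weights five to a byte as base-3 digits, which is what the "5 elements per byte (3^5 = 243 < 256)" comment refers to; the remaining 4 weights went into the single qs byte. A self-contained sketch of that packing idea (not the actual ggml pack/unpack kernels, whose digit order and fixed-point tricks may differ):

#include <stdint.h>

// Pack five ternary values {-1, 0, +1} into one byte as a base-3 number.
// 3^5 = 243 fits in a byte, which is why 5 values per byte works.
static uint8_t pack5_ternary(const int8_t t[5]) {
    uint8_t b = 0;
    for (int i = 4; i >= 0; --i) {
        b = (uint8_t)(b * 3 + (uint8_t)(t[i] + 1)); // map {-1,0,+1} -> {0,1,2}
    }
    return b; // 0..242
}

// Inverse: recover the five ternary values from one packed byte.
static void unpack5_ternary(uint8_t b, int8_t t[5]) {
    for (int i = 0; i < 5; ++i) {
        t[i] = (int8_t)(b % 3) - 1;
        b /= 3;
    }
}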
@@ -1084,41 +1070,6 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()
 
-GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256)
-    0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff,
-    0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001,
-    0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff,
-    0xff010000, 0xff010001, 0xff0101ff, 0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01,
-    0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00,
-    0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101,
-    0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000, 0x00010001, 0x000101ff, 0x00010100,
-    0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001,
-    0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000,
-    0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x0101ff01,
-    0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00,
-    0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff,
-    0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100,
-    0xff000101, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff,
-    0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0000,
-    0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff,
-    0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01,
-    0x000100ff, 0x00010000, 0x00010000, 0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff,
-    0x01ffff00, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101,
-    0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x01000001, 0x010001ff,
-    0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001,
-    0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000,
-    0xffff0001, 0xffff01ff, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01,
-    0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ff00,
-    0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff, 0xff0101ff, 0xff010100, 0xff010101,
-    0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100,
-    0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff,
-    0x00000100, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000,
-    0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ff00ff,
-    0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x01ff0101, 0x0100ffff, 0x0100ff00,
-    0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff,
-    0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101,
-GGML_TABLE_END()
-
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
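The deleted q1_3_grid mapped each possible byte value to four signed weights packed into a uint32_t, one byte per weight, with 0xff meaning -1, 0x00 meaning 0 and 0x01 meaning +1. A hedged sketch of decoding one entry (assuming little-endian element order; the real kernels consume the table differently, e.g. with SIMD loads):

#include <stdint.h>
#include <string.h>

// Expand one grid entry (four bytes, each 0xff / 0x00 / 0x01) into four
// signed ternary weights by reinterpreting the bytes as int8_t.
static void decode_q1_3_grid_entry(uint32_t entry, int8_t w[4]) {
    memcpy(w, &entry, 4); // 0xff becomes -1, 0x00 stays 0, 0x01 becomes +1
}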
